From 0bba0e074f720a5d03027ee8fdf699f46ce8af82 Mon Sep 17 00:00:00 2001 From: bizrockman Date: Mon, 4 Nov 2024 20:12:24 +0100 Subject: [PATCH 001/115] Preventing NoneType has no attribute get Errors Sometimes the list contains Tag elements that do not have attrs set, resulting in this Error. --- crawl4ai/utils.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index baa08a0f..869c22d5 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -706,9 +706,12 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold: body = flatten_nested_elements(body) base64_pattern = re.compile(r'data:image/[^;]+;base64,([^"]+)') for img in imgs: - src = img.get('src', '') - if base64_pattern.match(src): - img['src'] = base64_pattern.sub('', src) + try: + src = img.get('src', '') + if base64_pattern.match(src): + img['src'] = base64_pattern.sub('', src) + except: + pass cleaned_html = str(body).replace('\n\n', '\n').replace(' ', ' ') cleaned_html = sanitize_html(cleaned_html) From a28046c233059c3dc2c4ce442e5cda6f7f18645b Mon Sep 17 00:00:00 2001 From: bizrockman Date: Mon, 4 Nov 2024 20:18:26 +0100 Subject: [PATCH 002/115] Rename episode_08_Media_Handling:_Images,_Videos,_and_Audio.md to episode_08_Media_Handling_Images_Videos_and_Audio.md Name that will work in Windows --- ....md => episode_08_Media_Handling_Images_Videos_and_Audio.md} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename docs/md_v2/tutorial/{episode_08_Media_Handling:_Images,_Videos,_and_Audio.md => episode_08_Media_Handling_Images_Videos_and_Audio.md} (99%) diff --git a/docs/md_v2/tutorial/episode_08_Media_Handling:_Images,_Videos,_and_Audio.md b/docs/md_v2/tutorial/episode_08_Media_Handling_Images_Videos_and_Audio.md similarity index 99% rename from docs/md_v2/tutorial/episode_08_Media_Handling:_Images,_Videos,_and_Audio.md rename to docs/md_v2/tutorial/episode_08_Media_Handling_Images_Videos_and_Audio.md index 
c3a57009..c0daacad 100644 --- a/docs/md_v2/tutorial/episode_08_Media_Handling:_Images,_Videos,_and_Audio.md +++ b/docs/md_v2/tutorial/episode_08_Media_Handling_Images_Videos_and_Audio.md @@ -113,4 +113,4 @@ Here’s a clear and focused outline for the **Media Handling: Images, Videos, a --- -This outline provides users with a complete guide to handling images, videos, and audio in Crawl4AI, using metadata to enhance relevance and precision in multimedia extraction. \ No newline at end of file +This outline provides users with a complete guide to handling images, videos, and audio in Crawl4AI, using metadata to enhance relevance and precision in multimedia extraction. From 870296fa7ee43b221cdede34dbe22a8a2ea4ea4c Mon Sep 17 00:00:00 2001 From: bizrockman Date: Mon, 4 Nov 2024 20:18:58 +0100 Subject: [PATCH 003/115] Rename episode_11_1_Extraction_Strategies:_JSON_CSS.md to episode_11_1_Extraction_Strategies_JSON_CSS.md Name that will work in Windows --- ...ON_CSS.md => episode_11_1_Extraction_Strategies_JSON_CSS.md} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename docs/md_v2/tutorial/{episode_11_1_Extraction_Strategies:_JSON_CSS.md => episode_11_1_Extraction_Strategies_JSON_CSS.md} (99%) diff --git a/docs/md_v2/tutorial/episode_11_1_Extraction_Strategies:_JSON_CSS.md b/docs/md_v2/tutorial/episode_11_1_Extraction_Strategies_JSON_CSS.md similarity index 99% rename from docs/md_v2/tutorial/episode_11_1_Extraction_Strategies:_JSON_CSS.md rename to docs/md_v2/tutorial/episode_11_1_Extraction_Strategies_JSON_CSS.md index a8a357af..b460ff8c 100644 --- a/docs/md_v2/tutorial/episode_11_1_Extraction_Strategies:_JSON_CSS.md +++ b/docs/md_v2/tutorial/episode_11_1_Extraction_Strategies_JSON_CSS.md @@ -183,4 +183,4 @@ Here’s a detailed outline for the **JSON-CSS Extraction Strategy** video, cove --- -This outline covers each JSON-CSS Extraction option in Crawl4AI, with practical examples and schema configurations, making it a thorough guide for users. 
\ No newline at end of file +This outline covers each JSON-CSS Extraction option in Crawl4AI, with practical examples and schema configurations, making it a thorough guide for users. From 3a3c88a2d0d76141179d9284d43021083d1e663b Mon Sep 17 00:00:00 2001 From: bizrockman Date: Mon, 4 Nov 2024 20:19:20 +0100 Subject: [PATCH 004/115] Rename episode_11_2_Extraction_Strategies:_LLM.md to episode_11_2_Extraction_Strategies_LLM.md Name that will work in Windows --- ...tegies:_LLM.md => episode_11_2_Extraction_Strategies_LLM.md} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename docs/md_v2/tutorial/{episode_11_2_Extraction_Strategies:_LLM.md => episode_11_2_Extraction_Strategies_LLM.md} (99%) diff --git a/docs/md_v2/tutorial/episode_11_2_Extraction_Strategies:_LLM.md b/docs/md_v2/tutorial/episode_11_2_Extraction_Strategies_LLM.md similarity index 99% rename from docs/md_v2/tutorial/episode_11_2_Extraction_Strategies:_LLM.md rename to docs/md_v2/tutorial/episode_11_2_Extraction_Strategies_LLM.md index 900c32f2..3682425f 100644 --- a/docs/md_v2/tutorial/episode_11_2_Extraction_Strategies:_LLM.md +++ b/docs/md_v2/tutorial/episode_11_2_Extraction_Strategies_LLM.md @@ -150,4 +150,4 @@ Here’s a comprehensive outline for the **LLM Extraction Strategy** video, cove --- -This outline explains LLM Extraction in Crawl4AI, with examples showing how to extract structured data using custom schemas and instructions. It demonstrates flexibility with multiple providers, ensuring practical application for different use cases. \ No newline at end of file +This outline explains LLM Extraction in Crawl4AI, with examples showing how to extract structured data using custom schemas and instructions. It demonstrates flexibility with multiple providers, ensuring practical application for different use cases. 
From 796dbaf08c92efd606c5b82d00168c29702f6927 Mon Sep 17 00:00:00 2001 From: bizrockman Date: Mon, 4 Nov 2024 20:19:43 +0100 Subject: [PATCH 005/115] Rename episode_11_3_Extraction_Strategies:_Cosine.md to episode_11_3_Extraction_Strategies_Cosine.md Name that will work in Windows --- ...:_Cosine.md => episode_11_3_Extraction_Strategies_Cosine.md} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename docs/md_v2/tutorial/{episode_11_3_Extraction_Strategies:_Cosine.md => episode_11_3_Extraction_Strategies_Cosine.md} (99%) diff --git a/docs/md_v2/tutorial/episode_11_3_Extraction_Strategies:_Cosine.md b/docs/md_v2/tutorial/episode_11_3_Extraction_Strategies_Cosine.md similarity index 99% rename from docs/md_v2/tutorial/episode_11_3_Extraction_Strategies:_Cosine.md rename to docs/md_v2/tutorial/episode_11_3_Extraction_Strategies_Cosine.md index 61e210e4..9f1c00ea 100644 --- a/docs/md_v2/tutorial/episode_11_3_Extraction_Strategies:_Cosine.md +++ b/docs/md_v2/tutorial/episode_11_3_Extraction_Strategies_Cosine.md @@ -133,4 +133,4 @@ Here’s a structured outline for the **Cosine Similarity Strategy** video, cove --- -This outline covers Cosine Similarity Strategy’s speed and effectiveness, providing examples that showcase its potential for clustering various content types efficiently. \ No newline at end of file +This outline covers Cosine Similarity Strategy’s speed and effectiveness, providing examples that showcase its potential for clustering various content types efficiently. From 9f5eef1f3890094a4df707458fa611a83398429d Mon Sep 17 00:00:00 2001 From: UncleCode Date: Wed, 6 Nov 2024 21:50:09 +0800 Subject: [PATCH 006/115] Refactored the `CustomHTML2Text` class in `content_scrapping_strategy.py` to remove the handling logic for header tags (h1-h6), which are now commented out. This cleanup improves code readability and reduces maintenance overhead. 
--- crawl4ai/content_scrapping_strategy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crawl4ai/content_scrapping_strategy.py b/crawl4ai/content_scrapping_strategy.py index 66b3ad91..caed7319 100644 --- a/crawl4ai/content_scrapping_strategy.py +++ b/crawl4ai/content_scrapping_strategy.py @@ -93,8 +93,8 @@ class CustomHTML2Text(HTML2Text): else: self.o('\n```') self.inside_pre = False - elif tag in ["h1", "h2", "h3", "h4", "h5", "h6"]: - pass + # elif tag in ["h1", "h2", "h3", "h4", "h5", "h6"]: + # pass else: super().handle_tag(tag, attrs, start) From 2879344d9ccc281054587c079a5d5d2a2245b60a Mon Sep 17 00:00:00 2001 From: devatnull Date: Wed, 6 Nov 2024 17:36:46 +0300 Subject: [PATCH 007/115] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 28563762..a0e8b005 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# 🔥🕷️ Crawl4AI: LLM Friendly Web Crawler & Scrapper +# 🔥🕷️ Crawl4AI: LLM Friendly Web Crawler & Scraper unclecode%2Fcrawl4ai | Trendshift @@ -480,4 +480,4 @@ For a detailed exploration of our vision, challenges, and solutions, please see ## Star History -[![Star History Chart](https://api.star-history.com/svg?repos=unclecode/crawl4ai&type=Date)](https://star-history.com/#unclecode/crawl4ai&Date) \ No newline at end of file +[![Star History Chart](https://api.star-history.com/svg?repos=unclecode/crawl4ai&type=Date)](https://star-history.com/#unclecode/crawl4ai&Date) From f7574230a11278fef07f7dcaeb29a3b45752380f Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 7 Nov 2024 19:29:31 +0800 Subject: [PATCH 008/115] Update API server request object. 
text_docker file and Readme --- README.md | 2 +- main.py | 3 ++- tests/test_docker.py | 7 ++++--- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 28563762..1f36aca6 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ Use the [Crawl4AI GPT Assistant](https://tinyurl.com/crawl4ai-gpt) as your AI-po ## New in 0.3.73 ✨ - 🐳 Docker Ready: Full API server with seamless deployment & scaling -- 🎯 Smart Browser: Managed browser integration with CDP support +- 🎯 Browser Takeover: Use your own browser with cookies & history intact (CDP support) - 📝 Mockdown+: Enhanced tag preservation & content extraction - ⚡️ Parallel Power: Supercharged multi-URL crawling performance - 🌟 And many more exciting updates... diff --git a/main.py b/main.py index 3e32fe9c..853cd0b7 100644 --- a/main.py +++ b/main.py @@ -62,6 +62,7 @@ class CrawlRequest(BaseModel): css_selector: Optional[str] = None screenshot: bool = False magic: bool = False + extra: Optional[Dict[str, Any]] = {} @dataclass class TaskInfo: @@ -251,7 +252,7 @@ class CrawlerService: while True: try: available_slots = await self.resource_monitor.get_available_slots() - if available_slots <= 0: + if False and available_slots <= 0: await asyncio.sleep(1) continue diff --git a/tests/test_docker.py b/tests/test_docker.py index 913450ca..c22acd55 100644 --- a/tests/test_docker.py +++ b/tests/test_docker.py @@ -7,7 +7,7 @@ import os from typing import Dict, Any class Crawl4AiTester: - def __init__(self, base_url: str = "http://localhost:8000"): + def __init__(self, base_url: str = "http://localhost:11235"): self.base_url = base_url def submit_and_wait(self, request_data: Dict[str, Any], timeout: int = 300) -> Dict[str, Any]: @@ -54,8 +54,9 @@ def test_docker_deployment(version="basic"): # Test cases based on version test_basic_crawl(tester) - if version in ["full", "transformer"]: - test_cosine_extraction(tester) + + # if version in ["full", "transformer"]: + # test_cosine_extraction(tester) 
# test_js_execution(tester) # test_css_selector(tester) From b120965b6ac9773f599a854f214aa16c1a7426b9 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 7 Nov 2024 20:15:03 +0800 Subject: [PATCH 009/115] Fixed issues with the Manage Browser, including its inability to connect to the user directory and inability to create new pages within the Manage Browser context; all issues are now resolved. --- crawl4ai/async_crawler_strategy.py | 143 +++++++++++++++++++---------- 1 file changed, 93 insertions(+), 50 deletions(-) diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index e79c8268..fa50e7b5 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -187,6 +187,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): self.use_managed_browser = kwargs.get("use_managed_browser", False) self.user_data_dir = kwargs.get("user_data_dir", None) self.managed_browser = None + self.default_context = None self.hooks = { 'on_browser_created': None, 'on_user_agent_updated': None, @@ -217,6 +218,25 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): ) cdp_url = await self.managed_browser.start() self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url) + + # Get the default context that maintains the user profile + contexts = self.browser.contexts + if contexts: + self.default_context = contexts[0] + else: + # If no default context exists, create one + self.default_context = await self.browser.new_context( + viewport={"width": 1920, "height": 1080} + ) + + # Set up the default context + if self.default_context: + await self.default_context.set_extra_http_headers(self.headers) + + if self.user_agent: + await self.default_context.set_extra_http_headers({ + "User-Agent": self.user_agent + }) else: browser_args = { "headless": self.headless, @@ -254,12 +274,20 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): async def close(self): if self.sleep_on_close: await 
asyncio.sleep(0.5) + + # Close all active sessions + session_ids = list(self.sessions.keys()) + for session_id in session_ids: + await self.kill_session(session_id) + if self.browser: await self.browser.close() self.browser = None + if self.managed_browser: await self.managed_browser.cleanup() self.managed_browser = None + if self.playwright: await self.playwright.stop() self.playwright = None @@ -293,7 +321,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if session_id in self.sessions: context, page, _ = self.sessions[session_id] await page.close() - await context.close() + if not self.use_managed_browser: + await context.close() del self.sessions[session_id] def _cleanup_expired_sessions(self): @@ -415,61 +444,75 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): self._cleanup_expired_sessions() session_id = kwargs.get("session_id") - if session_id: - context, page, _ = self.sessions.get(session_id, (None, None, None)) - if not context: + + # Handle page creation differently for managed browser + if self.use_managed_browser: + if session_id: + # Reuse existing session if available + context, page, _ = self.sessions.get(session_id, (None, None, None)) + if not page: + # Create new page in default context if session doesn't exist + page = await self.default_context.new_page() + self.sessions[session_id] = (self.default_context, page, time.time()) + else: + # Create new page in default context for non-session requests + page = await self.default_context.new_page() + else: + if session_id: + context, page, _ = self.sessions.get(session_id, (None, None, None)) + if not context: + context = await self.browser.new_context( + user_agent=self.user_agent, + viewport={"width": 1920, "height": 1080}, + proxy={"server": self.proxy} if self.proxy else None, + accept_downloads=True, + java_script_enabled=True + ) + await context.add_cookies([{"name": "cookiesEnabled", "value": "true", "url": url}]) + await 
context.set_extra_http_headers(self.headers) + page = await context.new_page() + self.sessions[session_id] = (context, page, time.time()) + else: context = await self.browser.new_context( user_agent=self.user_agent, viewport={"width": 1920, "height": 1080}, - proxy={"server": self.proxy} if self.proxy else None, - accept_downloads=True, - java_script_enabled=True + proxy={"server": self.proxy} if self.proxy else None ) - await context.add_cookies([{"name": "cookiesEnabled", "value": "true", "url": url}]) await context.set_extra_http_headers(self.headers) + + if kwargs.get("override_navigator", False) or kwargs.get("simulate_user", False) or kwargs.get("magic", False): + # Inject scripts to override navigator properties + await context.add_init_script(""" + // Pass the Permissions Test. + const originalQuery = window.navigator.permissions.query; + window.navigator.permissions.query = (parameters) => ( + parameters.name === 'notifications' ? + Promise.resolve({ state: Notification.permission }) : + originalQuery(parameters) + ); + Object.defineProperty(navigator, 'webdriver', { + get: () => undefined + }); + window.navigator.chrome = { + runtime: {}, + // Add other properties if necessary + }; + Object.defineProperty(navigator, 'plugins', { + get: () => [1, 2, 3, 4, 5], + }); + Object.defineProperty(navigator, 'languages', { + get: () => ['en-US', 'en'], + }); + Object.defineProperty(document, 'hidden', { + get: () => false + }); + Object.defineProperty(document, 'visibilityState', { + get: () => 'visible' + }); + """) + page = await context.new_page() - self.sessions[session_id] = (context, page, time.time()) - else: - context = await self.browser.new_context( - user_agent=self.user_agent, - viewport={"width": 1920, "height": 1080}, - proxy={"server": self.proxy} if self.proxy else None - ) - await context.set_extra_http_headers(self.headers) - - if kwargs.get("override_navigator", False) or kwargs.get("simulate_user", False) or kwargs.get("magic", False): - # 
Inject scripts to override navigator properties - await context.add_init_script(""" - // Pass the Permissions Test. - const originalQuery = window.navigator.permissions.query; - window.navigator.permissions.query = (parameters) => ( - parameters.name === 'notifications' ? - Promise.resolve({ state: Notification.permission }) : - originalQuery(parameters) - ); - Object.defineProperty(navigator, 'webdriver', { - get: () => undefined - }); - window.navigator.chrome = { - runtime: {}, - // Add other properties if necessary - }; - Object.defineProperty(navigator, 'plugins', { - get: () => [1, 2, 3, 4, 5], - }); - Object.defineProperty(navigator, 'languages', { - get: () => ['en-US', 'en'], - }); - Object.defineProperty(document, 'hidden', { - get: () => false - }); - Object.defineProperty(document, 'visibilityState', { - get: () => 'visible' - }); - """) - - page = await context.new_page() - # await stealth_async(page) #, stealth_config) + # await stealth_async(page) #, stealth_config) # Add console message and error logging if kwargs.get("log_console", False): From bcdd80911fff320b041921f3d25524aae103e79a Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 8 Nov 2024 19:08:58 +0800 Subject: [PATCH 010/115] Remove some old files. 
--- crawl4ai/train.py | 146 -------------- crawl4ai/web_crawler.back.py | 357 ----------------------------------- 2 files changed, 503 deletions(-) delete mode 100644 crawl4ai/train.py delete mode 100644 crawl4ai/web_crawler.back.py diff --git a/crawl4ai/train.py b/crawl4ai/train.py deleted file mode 100644 index f7e7c1a9..00000000 --- a/crawl4ai/train.py +++ /dev/null @@ -1,146 +0,0 @@ -import spacy -from spacy.training import Example -import random -import nltk -from nltk.corpus import reuters -import torch - -def save_spacy_model_as_torch(nlp, model_dir="models/reuters"): - # Extract the TextCategorizer component - textcat = nlp.get_pipe("textcat_multilabel") - - # Convert the weights to a PyTorch state dictionary - state_dict = {name: torch.tensor(param.data) for name, param in textcat.model.named_parameters()} - - # Save the state dictionary - torch.save(state_dict, f"{model_dir}/model_weights.pth") - - # Extract and save the vocabulary - vocab = extract_vocab(nlp) - with open(f"{model_dir}/vocab.txt", "w") as vocab_file: - for word, idx in vocab.items(): - vocab_file.write(f"{word}\t{idx}\n") - - print(f"Model weights and vocabulary saved to: {model_dir}") - -def extract_vocab(nlp): - # Extract vocabulary from the SpaCy model - vocab = {word: i for i, word in enumerate(nlp.vocab.strings)} - return vocab - -nlp = spacy.load("models/reuters") -save_spacy_model_as_torch(nlp, model_dir="models") - -def train_and_save_reuters_model(model_dir="models/reuters"): - # Ensure the Reuters corpus is downloaded - nltk.download('reuters') - nltk.download('punkt') - if not reuters.fileids(): - print("Reuters corpus not found.") - return - - # Load a blank English spaCy model - nlp = spacy.blank("en") - - # Create a TextCategorizer with the ensemble model for multi-label classification - textcat = nlp.add_pipe("textcat_multilabel") - - # Add labels to text classifier - for label in reuters.categories(): - textcat.add_label(label) - - # Prepare training data - train_examples 
= [] - for fileid in reuters.fileids(): - categories = reuters.categories(fileid) - text = reuters.raw(fileid) - cats = {label: label in categories for label in reuters.categories()} - # Prepare spacy Example objects - doc = nlp.make_doc(text) - example = Example.from_dict(doc, {'cats': cats}) - train_examples.append(example) - - # Initialize the text categorizer with the example objects - nlp.initialize(lambda: train_examples) - - # Train the model - random.seed(1) - spacy.util.fix_random_seed(1) - for i in range(5): # Adjust iterations for better accuracy - random.shuffle(train_examples) - losses = {} - # Create batches of data - batches = spacy.util.minibatch(train_examples, size=8) - for batch in batches: - nlp.update(batch, drop=0.2, losses=losses) - print(f"Losses at iteration {i}: {losses}") - - # Save the trained model - nlp.to_disk(model_dir) - print(f"Model saved to: {model_dir}") - -def train_model(model_dir, additional_epochs=0): - # Load the model if it exists, otherwise start with a blank model - try: - nlp = spacy.load(model_dir) - print("Model loaded from disk.") - except IOError: - print("No existing model found. 
Starting with a new model.") - nlp = spacy.blank("en") - textcat = nlp.add_pipe("textcat_multilabel") - for label in reuters.categories(): - textcat.add_label(label) - - # Prepare training data - train_examples = [] - for fileid in reuters.fileids(): - categories = reuters.categories(fileid) - text = reuters.raw(fileid) - cats = {label: label in categories for label in reuters.categories()} - doc = nlp.make_doc(text) - example = Example.from_dict(doc, {'cats': cats}) - train_examples.append(example) - - # Initialize the model if it was newly created - if 'textcat_multilabel' not in nlp.pipe_names: - nlp.initialize(lambda: train_examples) - else: - print("Continuing training with existing model.") - - # Train the model - random.seed(1) - spacy.util.fix_random_seed(1) - num_epochs = 5 + additional_epochs - for i in range(num_epochs): - random.shuffle(train_examples) - losses = {} - batches = spacy.util.minibatch(train_examples, size=8) - for batch in batches: - nlp.update(batch, drop=0.2, losses=losses) - print(f"Losses at iteration {i}: {losses}") - - # Save the trained model - nlp.to_disk(model_dir) - print(f"Model saved to: {model_dir}") - -def load_model_and_predict(model_dir, text, tok_k = 3): - # Load the trained model from the specified directory - nlp = spacy.load(model_dir) - - # Process the text with the loaded model - doc = nlp(text) - - # gee top 3 categories - top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k] - print(f"Top {tok_k} categories:") - - return top_categories - -if __name__ == "__main__": - train_and_save_reuters_model() - train_model("models/reuters", additional_epochs=5) - model_directory = "reuters_model_10" - print(reuters.categories()) - example_text = "Apple Inc. 
is reportedly buying a startup for $1 billion" - r =load_model_and_predict(model_directory, example_text) - print(r) \ No newline at end of file diff --git a/crawl4ai/web_crawler.back.py b/crawl4ai/web_crawler.back.py deleted file mode 100644 index af78f126..00000000 --- a/crawl4ai/web_crawler.back.py +++ /dev/null @@ -1,357 +0,0 @@ -import os, time -os.environ["TOKENIZERS_PARALLELISM"] = "false" -from pathlib import Path - -from .models import UrlModel, CrawlResult -from .database import init_db, get_cached_url, cache_url, DB_PATH, flush_db -from .utils import * -from .chunking_strategy import * -from .extraction_strategy import * -from .crawler_strategy import * -from typing import List -from concurrent.futures import ThreadPoolExecutor -from .config import * - - -class WebCrawler: - def __init__( - self, - # db_path: str = None, - crawler_strategy: CrawlerStrategy = None, - always_by_pass_cache: bool = False, - verbose: bool = False, - ): - # self.db_path = db_path - self.crawler_strategy = crawler_strategy or LocalSeleniumCrawlerStrategy(verbose=verbose) - self.always_by_pass_cache = always_by_pass_cache - - # Create the .crawl4ai folder in the user's home directory if it doesn't exist - self.crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai") - os.makedirs(self.crawl4ai_folder, exist_ok=True) - os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True) - - # If db_path is not provided, use the default path - # if not db_path: - # self.db_path = f"{self.crawl4ai_folder}/crawl4ai.db" - - # flush_db() - init_db() - - self.ready = False - - def warmup(self): - print("[LOG] 🌤️ Warming up the WebCrawler") - result = self.run( - url='https://crawl4ai.uccode.io/', - word_count_threshold=5, - extraction_strategy= NoExtractionStrategy(), - bypass_cache=False, - verbose = False - ) - self.ready = True - print("[LOG] 🌞 WebCrawler is ready to crawl") - - def fetch_page( - self, - url_model: UrlModel, - provider: str = DEFAULT_PROVIDER, - api_token: str = None, - 
extract_blocks_flag: bool = True, - word_count_threshold=MIN_WORD_THRESHOLD, - css_selector: str = None, - screenshot: bool = False, - use_cached_html: bool = False, - extraction_strategy: ExtractionStrategy = None, - chunking_strategy: ChunkingStrategy = RegexChunking(), - **kwargs, - ) -> CrawlResult: - return self.run( - url_model.url, - word_count_threshold, - extraction_strategy or NoExtractionStrategy(), - chunking_strategy, - bypass_cache=url_model.forced, - css_selector=css_selector, - screenshot=screenshot, - **kwargs, - ) - pass - - def run_old( - self, - url: str, - word_count_threshold=MIN_WORD_THRESHOLD, - extraction_strategy: ExtractionStrategy = None, - chunking_strategy: ChunkingStrategy = RegexChunking(), - bypass_cache: bool = False, - css_selector: str = None, - screenshot: bool = False, - user_agent: str = None, - verbose=True, - **kwargs, - ) -> CrawlResult: - if user_agent: - self.crawler_strategy.update_user_agent(user_agent) - extraction_strategy = extraction_strategy or NoExtractionStrategy() - extraction_strategy.verbose = verbose - # Check if extraction strategy is an instance of ExtractionStrategy if not raise an error - if not isinstance(extraction_strategy, ExtractionStrategy): - raise ValueError("Unsupported extraction strategy") - if not isinstance(chunking_strategy, ChunkingStrategy): - raise ValueError("Unsupported chunking strategy") - - # make sure word_count_threshold is not lesser than MIN_WORD_THRESHOLD - if word_count_threshold < MIN_WORD_THRESHOLD: - word_count_threshold = MIN_WORD_THRESHOLD - - # Check cache first - if not bypass_cache and not self.always_by_pass_cache: - cached = get_cached_url(url) - if cached: - return CrawlResult( - **{ - "url": cached[0], - "html": cached[1], - "cleaned_html": cached[2], - "markdown": cached[3], - "extracted_content": cached[4], - "success": cached[5], - "media": json.loads(cached[6] or "{}"), - "links": json.loads(cached[7] or "{}"), - "metadata": json.loads(cached[8] or "{}"), # 
"metadata": "{} - "screenshot": cached[9], - "error_message": "", - } - ) - - # Initialize WebDriver for crawling - t = time.time() - if kwargs.get("js", None): - self.crawler_strategy.js_code = kwargs.get("js") - html = self.crawler_strategy.crawl(url) - base64_image = None - if screenshot: - base64_image = self.crawler_strategy.take_screenshot() - success = True - error_message = "" - # Extract content from HTML - try: - result = get_content_of_website(url, html, word_count_threshold, css_selector=css_selector) - metadata = extract_metadata(html) - if result is None: - raise ValueError(f"Failed to extract content from the website: {url}") - except InvalidCSSSelectorError as e: - raise ValueError(str(e)) - - cleaned_html = result.get("cleaned_html", "") - markdown = result.get("markdown", "") - media = result.get("media", []) - links = result.get("links", []) - - # Print a profession LOG style message, show time taken and say crawling is done - if verbose: - print( - f"[LOG] 🚀 Crawling done for {url}, success: {success}, time taken: {time.time() - t} seconds" - ) - - extracted_content = [] - if verbose: - print(f"[LOG] 🔥 Extracting semantic blocks for {url}, Strategy: {extraction_strategy.name}") - t = time.time() - # Split markdown into sections - sections = chunking_strategy.chunk(markdown) - # sections = merge_chunks_based_on_token_threshold(sections, CHUNK_TOKEN_THRESHOLD) - - extracted_content = extraction_strategy.run( - url, sections, - ) - extracted_content = json.dumps(extracted_content) - - if verbose: - print( - f"[LOG] 🚀 Extraction done for {url}, time taken: {time.time() - t} seconds." 
- ) - - # Cache the result - cleaned_html = beautify_html(cleaned_html) - cache_url( - url, - html, - cleaned_html, - markdown, - extracted_content, - success, - json.dumps(media), - json.dumps(links), - json.dumps(metadata), - screenshot=base64_image, - ) - - return CrawlResult( - url=url, - html=html, - cleaned_html=cleaned_html, - markdown=markdown, - media=media, - links=links, - metadata=metadata, - screenshot=base64_image, - extracted_content=extracted_content, - success=success, - error_message=error_message, - ) - - def fetch_pages( - self, - url_models: List[UrlModel], - provider: str = DEFAULT_PROVIDER, - api_token: str = None, - extract_blocks_flag: bool = True, - word_count_threshold=MIN_WORD_THRESHOLD, - use_cached_html: bool = False, - css_selector: str = None, - screenshot: bool = False, - extraction_strategy: ExtractionStrategy = None, - chunking_strategy: ChunkingStrategy = RegexChunking(), - **kwargs, - ) -> List[CrawlResult]: - extraction_strategy = extraction_strategy or NoExtractionStrategy() - def fetch_page_wrapper(url_model, *args, **kwargs): - return self.fetch_page(url_model, *args, **kwargs) - - with ThreadPoolExecutor() as executor: - results = list( - executor.map( - fetch_page_wrapper, - url_models, - [provider] * len(url_models), - [api_token] * len(url_models), - [extract_blocks_flag] * len(url_models), - [word_count_threshold] * len(url_models), - [css_selector] * len(url_models), - [screenshot] * len(url_models), - [use_cached_html] * len(url_models), - [extraction_strategy] * len(url_models), - [chunking_strategy] * len(url_models), - *[kwargs] * len(url_models), - ) - ) - - return results - - def run( - self, - url: str, - word_count_threshold=MIN_WORD_THRESHOLD, - extraction_strategy: ExtractionStrategy = None, - chunking_strategy: ChunkingStrategy = RegexChunking(), - bypass_cache: bool = False, - css_selector: str = None, - screenshot: bool = False, - user_agent: str = None, - verbose=True, - **kwargs, - ) -> CrawlResult: - 
extraction_strategy = extraction_strategy or NoExtractionStrategy() - extraction_strategy.verbose = verbose - if not isinstance(extraction_strategy, ExtractionStrategy): - raise ValueError("Unsupported extraction strategy") - if not isinstance(chunking_strategy, ChunkingStrategy): - raise ValueError("Unsupported chunking strategy") - - if word_count_threshold < MIN_WORD_THRESHOLD: - word_count_threshold = MIN_WORD_THRESHOLD - - # Check cache first - cached = None - extracted_content = None - if not bypass_cache and not self.always_by_pass_cache: - cached = get_cached_url(url) - - if cached: - html = cached[1] - extracted_content = cached[2] - if screenshot: - screenshot = cached[9] - - else: - if user_agent: - self.crawler_strategy.update_user_agent(user_agent) - html = self.crawler_strategy.crawl(url) - if screenshot: - screenshot = self.crawler_strategy.take_screenshot() - - return self.process_html(url, html, extracted_content, word_count_threshold, extraction_strategy, chunking_strategy, css_selector, screenshot, verbose, bool(cached), **kwargs) - - def process_html( - self, - url: str, - html: str, - extracted_content: str, - word_count_threshold: int, - extraction_strategy: ExtractionStrategy, - chunking_strategy: ChunkingStrategy, - css_selector: str, - screenshot: bool, - verbose: bool, - is_cached: bool, - **kwargs, - ) -> CrawlResult: - t = time.time() - # Extract content from HTML - try: - result = get_content_of_website(url, html, word_count_threshold, css_selector=css_selector) - metadata = extract_metadata(html) - if result is None: - raise ValueError(f"Failed to extract content from the website: {url}") - except InvalidCSSSelectorError as e: - raise ValueError(str(e)) - - cleaned_html = result.get("cleaned_html", "") - markdown = result.get("markdown", "") - media = result.get("media", []) - links = result.get("links", []) - - if verbose: - print(f"[LOG] 🚀 Crawling done for {url}, success: True, time taken: {time.time() - t} seconds") - - if 
extracted_content is None: - if verbose: - print(f"[LOG] 🔥 Extracting semantic blocks for {url}, Strategy: {extraction_strategy.name}") - - sections = chunking_strategy.chunk(markdown) - extracted_content = extraction_strategy.run(url, sections) - extracted_content = json.dumps(extracted_content) - - if verbose: - print(f"[LOG] 🚀 Extraction done for {url}, time taken: {time.time() - t} seconds.") - - screenshot = None if not screenshot else screenshot - - if not is_cached: - cache_url( - url, - html, - cleaned_html, - markdown, - extracted_content, - True, - json.dumps(media), - json.dumps(links), - json.dumps(metadata), - screenshot=screenshot, - ) - - return CrawlResult( - url=url, - html=html, - cleaned_html=cleaned_html, - markdown=markdown, - media=media, - links=links, - metadata=metadata, - screenshot=screenshot, - extracted_content=extracted_content, - success=True, - error_message="", - ) \ No newline at end of file From f9a297e08deac1963a302f8e70d0fe284564ceca Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 8 Nov 2024 19:39:05 +0800 Subject: [PATCH 011/115] Add Docker example script for testing Crawl4AI functionality --- docs/examples/docker_example.py | 300 ++++++++++++++++++++++++++++++++ 1 file changed, 300 insertions(+) create mode 100644 docs/examples/docker_example.py diff --git a/docs/examples/docker_example.py b/docs/examples/docker_example.py new file mode 100644 index 00000000..c22acd55 --- /dev/null +++ b/docs/examples/docker_example.py @@ -0,0 +1,300 @@ +import requests +import json +import time +import sys +import base64 +import os +from typing import Dict, Any + +class Crawl4AiTester: + def __init__(self, base_url: str = "http://localhost:11235"): + self.base_url = base_url + + def submit_and_wait(self, request_data: Dict[str, Any], timeout: int = 300) -> Dict[str, Any]: + # Submit crawl job + response = requests.post(f"{self.base_url}/crawl", json=request_data) + task_id = response.json()["task_id"] + print(f"Task ID: {task_id}") + + # 
Poll for result + start_time = time.time() + while True: + if time.time() - start_time > timeout: + raise TimeoutError(f"Task {task_id} did not complete within {timeout} seconds") + + result = requests.get(f"{self.base_url}/task/{task_id}") + status = result.json() + + if status["status"] == "failed": + print("Task failed:", status.get("error")) + raise Exception(f"Task failed: {status.get('error')}") + + if status["status"] == "completed": + return status + + time.sleep(2) + +def test_docker_deployment(version="basic"): + tester = Crawl4AiTester() + print(f"Testing Crawl4AI Docker {version} version") + + # Health check with timeout and retry + max_retries = 5 + for i in range(max_retries): + try: + health = requests.get(f"{tester.base_url}/health", timeout=10) + print("Health check:", health.json()) + break + except requests.exceptions.RequestException as e: + if i == max_retries - 1: + print(f"Failed to connect after {max_retries} attempts") + sys.exit(1) + print(f"Waiting for service to start (attempt {i+1}/{max_retries})...") + time.sleep(5) + + # Test cases based on version + test_basic_crawl(tester) + + # if version in ["full", "transformer"]: + # test_cosine_extraction(tester) + + # test_js_execution(tester) + # test_css_selector(tester) + # test_structured_extraction(tester) + # test_llm_extraction(tester) + # test_llm_with_ollama(tester) + # test_screenshot(tester) + + +def test_basic_crawl(tester: Crawl4AiTester): + print("\n=== Testing Basic Crawl ===") + request = { + "urls": "https://www.nbcnews.com/business", + "priority": 10 + } + + result = tester.submit_and_wait(request) + print(f"Basic crawl result length: {len(result['result']['markdown'])}") + assert result["result"]["success"] + assert len(result["result"]["markdown"]) > 0 + +def test_js_execution(tester: Crawl4AiTester): + print("\n=== Testing JS Execution ===") + request = { + "urls": "https://www.nbcnews.com/business", + "priority": 8, + "js_code": [ + "const loadMoreButton = 
Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();" + ], + "wait_for": "article.tease-card:nth-child(10)", + "crawler_params": { + "headless": True + } + } + + result = tester.submit_and_wait(request) + print(f"JS execution result length: {len(result['result']['markdown'])}") + assert result["result"]["success"] + +def test_css_selector(tester: Crawl4AiTester): + print("\n=== Testing CSS Selector ===") + request = { + "urls": "https://www.nbcnews.com/business", + "priority": 7, + "css_selector": ".wide-tease-item__description", + "crawler_params": { + "headless": True + }, + "extra": {"word_count_threshold": 10} + + } + + result = tester.submit_and_wait(request) + print(f"CSS selector result length: {len(result['result']['markdown'])}") + assert result["result"]["success"] + +def test_structured_extraction(tester: Crawl4AiTester): + print("\n=== Testing Structured Extraction ===") + schema = { + "name": "Coinbase Crypto Prices", + "baseSelector": ".cds-tableRow-t45thuk", + "fields": [ + { + "name": "crypto", + "selector": "td:nth-child(1) h2", + "type": "text", + }, + { + "name": "symbol", + "selector": "td:nth-child(1) p", + "type": "text", + }, + { + "name": "price", + "selector": "td:nth-child(2)", + "type": "text", + } + ], + } + + request = { + "urls": "https://www.coinbase.com/explore", + "priority": 9, + "extraction_config": { + "type": "json_css", + "params": { + "schema": schema + } + } + } + + result = tester.submit_and_wait(request) + extracted = json.loads(result["result"]["extracted_content"]) + print(f"Extracted {len(extracted)} items") + print("Sample item:", json.dumps(extracted[0], indent=2)) + assert result["result"]["success"] + assert len(extracted) > 0 + +def test_llm_extraction(tester: Crawl4AiTester): + print("\n=== Testing LLM Extraction ===") + schema = { + "type": "object", + "properties": { + "model_name": { + "type": "string", + 
"description": "Name of the OpenAI model." + }, + "input_fee": { + "type": "string", + "description": "Fee for input token for the OpenAI model." + }, + "output_fee": { + "type": "string", + "description": "Fee for output token for the OpenAI model." + } + }, + "required": ["model_name", "input_fee", "output_fee"] + } + + request = { + "urls": "https://openai.com/api/pricing", + "priority": 8, + "extraction_config": { + "type": "llm", + "params": { + "provider": "openai/gpt-4o-mini", + "api_token": os.getenv("OPENAI_API_KEY"), + "schema": schema, + "extraction_type": "schema", + "instruction": """From the crawled content, extract all mentioned model names along with their fees for input and output tokens.""" + } + }, + "crawler_params": {"word_count_threshold": 1} + } + + try: + result = tester.submit_and_wait(request) + extracted = json.loads(result["result"]["extracted_content"]) + print(f"Extracted {len(extracted)} model pricing entries") + print("Sample entry:", json.dumps(extracted[0], indent=2)) + assert result["result"]["success"] + except Exception as e: + print(f"LLM extraction test failed (might be due to missing API key): {str(e)}") + +def test_llm_with_ollama(tester: Crawl4AiTester): + print("\n=== Testing LLM with Ollama ===") + schema = { + "type": "object", + "properties": { + "article_title": { + "type": "string", + "description": "The main title of the news article" + }, + "summary": { + "type": "string", + "description": "A brief summary of the article content" + }, + "main_topics": { + "type": "array", + "items": {"type": "string"}, + "description": "Main topics or themes discussed in the article" + } + } + } + + request = { + "urls": "https://www.nbcnews.com/business", + "priority": 8, + "extraction_config": { + "type": "llm", + "params": { + "provider": "ollama/llama2", + "schema": schema, + "extraction_type": "schema", + "instruction": "Extract the main article information including title, summary, and main topics." 
+ } + }, + "extra": {"word_count_threshold": 1}, + "crawler_params": {"verbose": True} + } + + try: + result = tester.submit_and_wait(request) + extracted = json.loads(result["result"]["extracted_content"]) + print("Extracted content:", json.dumps(extracted, indent=2)) + assert result["result"]["success"] + except Exception as e: + print(f"Ollama extraction test failed: {str(e)}") + +def test_cosine_extraction(tester: Crawl4AiTester): + print("\n=== Testing Cosine Extraction ===") + request = { + "urls": "https://www.nbcnews.com/business", + "priority": 8, + "extraction_config": { + "type": "cosine", + "params": { + "semantic_filter": "business finance economy", + "word_count_threshold": 10, + "max_dist": 0.2, + "top_k": 3 + } + } + } + + try: + result = tester.submit_and_wait(request) + extracted = json.loads(result["result"]["extracted_content"]) + print(f"Extracted {len(extracted)} text clusters") + print("First cluster tags:", extracted[0]["tags"]) + assert result["result"]["success"] + except Exception as e: + print(f"Cosine extraction test failed: {str(e)}") + +def test_screenshot(tester: Crawl4AiTester): + print("\n=== Testing Screenshot ===") + request = { + "urls": "https://www.nbcnews.com/business", + "priority": 5, + "screenshot": True, + "crawler_params": { + "headless": True + } + } + + result = tester.submit_and_wait(request) + print("Screenshot captured:", bool(result["result"]["screenshot"])) + + if result["result"]["screenshot"]: + # Save screenshot + screenshot_data = base64.b64decode(result["result"]["screenshot"]) + with open("test_screenshot.jpg", "wb") as f: + f.write(screenshot_data) + print("Screenshot saved as test_screenshot.jpg") + + assert result["result"]["success"] + +if __name__ == "__main__": + version = sys.argv[1] if len(sys.argv) > 1 else "basic" + # version = "full" + test_docker_deployment(version) \ No newline at end of file From a098483cbbb26be3d3b6f8d678f8409d6b007789 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sat, 9 Nov 
2024 20:40:30 +0800 Subject: [PATCH 012/115] Update Roadmap --- README.md | 24 +++ ROADMAP.md | 503 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 527 insertions(+) create mode 100644 ROADMAP.md diff --git a/README.md b/README.md index 1f36aca6..e1a64aa1 100644 --- a/README.md +++ b/README.md @@ -432,6 +432,30 @@ You can find the full comparison code in our repository at `docs/examples/crawl4 For detailed documentation, including installation instructions, advanced features, and API reference, visit our [Documentation Website](https://crawl4ai.com/mkdocs/). +## Crawl4AI Roadmap 🗺️ + +For detailed information on our development plans and upcoming features, check out our [Roadmap](https://github.com/unclecode/crawl4ai/blob/main/ROADMAP.md). + +### Advanced Crawling Systems 🔧 +- [x] 0. Graph Crawler: Smart website traversal using graph search algorithms for comprehensive nested page extraction +- [ ] 1. Question-Based Crawler: Natural language driven web discovery and content extraction +- [ ] 2. Knowledge-Optimal Crawler: Smart crawling that maximizes knowledge while minimizing data extraction +- [ ] 3. Agentic Crawler: Autonomous system for complex multi-step crawling operations + +### Specialized Features 🛠️ +- [ ] 4. Automated Schema Generator: Convert natural language to extraction schemas +- [ ] 5. Domain-Specific Scrapers: Pre-configured extractors for common platforms (academic, e-commerce) +- [ ] 6. Web Embedding Index: Semantic search infrastructure for crawled content + +### Development Tools 🔨 +- [ ] 7. Interactive Playground: Web UI for testing, comparing strategies with AI assistance +- [ ] 8. Performance Monitor: Real-time insights into crawler operations +- [ ] 9. Cloud Integration: One-click deployment solutions across cloud providers + +### Community & Growth 🌱 +- [ ] 10. Sponsorship Program: Structured support system with tiered benefits +- [ ] 11. 
Educational Content: "How to Crawl" video series and interactive tutorials + ## Contributing 🤝 We welcome contributions from the open-source community. Check out our [contribution guidelines](https://github.com/unclecode/crawl4ai/blob/main/CONTRIBUTING.md) for more information. diff --git a/ROADMAP.md b/ROADMAP.md new file mode 100644 index 00000000..0fd784c1 --- /dev/null +++ b/ROADMAP.md @@ -0,0 +1,503 @@ +# Crawl4AI Strategic Roadmap + +```mermaid +%%{init: {'themeVariables': { 'fontSize': '14px'}}}%% +graph TD + subgraph A1[Advanced Crawling Systems 🔧] + A["` + • Graph Crawler ✓ + • Question-Based Crawler + • Knowledge-Optimal Crawler + • Agentic Crawler + `"] + end + + subgraph A2[Specialized Features 🛠️] + B["` + • Automated Schema Generator + • Domain-Specific Scrapers + • + • + `"] + end + + subgraph A3[Development Tools 🔨] + C["` + • Interactive Playground + • Performance Monitor + • Cloud Integration + • + `"] + end + + subgraph A4[Community & Growth 🌱] + D["` + • Sponsorship Program + • Educational Content + • + • + `"] + end + + classDef default fill:#f9f9f9,stroke:#333,stroke-width:2px + classDef section fill:#f0f0f0,stroke:#333,stroke-width:4px,rx:10 + class A1,A2,A3,A4 section + + %% Layout hints + A1 --> A2[" "] + A3 --> A4[" "] + linkStyle 0,1 stroke:none +``` + +Crawl4AI is evolving to provide more intelligent, efficient, and versatile web crawling capabilities. This roadmap outlines the key developments and features planned for the project, organized into strategic sections that build upon our current foundation. + +## 1. Advanced Crawling Systems 🔧 + +This section introduces three powerful crawling systems that extend Crawl4AI's capabilities from basic web crawling to intelligent, purpose-driven data extraction. + +### 1.1 Question-Based Crawler +The Question-Based Crawler enhances our core engine by enabling automatic discovery and extraction of relevant web content based on natural language questions. 
+ +Key Features: +- SerpApi integration for intelligent web search +- Relevancy scoring for search results +- Automatic URL discovery and prioritization +- Cross-source validation + +```python +from crawl4ai import AsyncWebCrawler +from crawl4ai.discovery import QuestionBasedDiscovery + +async with AsyncWebCrawler() as crawler: + discovery = QuestionBasedDiscovery(crawler) + results = await discovery.arun( + question="What are the system requirements for major cloud providers' GPU instances?", + max_urls=5, + relevance_threshold=0.7 + ) + + for result in results: + print(f"Source: {result.url} (Relevance: {result.relevance_score})") + print(f"Content: {result.markdown}\n") +``` + +### 1.2 Knowledge-Optimal Crawler +An intelligent crawling system that solves the optimization problem of minimizing data extraction while maximizing knowledge acquisition for specific objectives. + +Key Features: +- Smart content prioritization +- Minimal data extraction for maximum knowledge +- Probabilistic relevance assessment +- Objective-driven crawling paths + +```python +from crawl4ai import AsyncWebCrawler +from crawl4ai.optimization import KnowledgeOptimizer + +async with AsyncWebCrawler() as crawler: + optimizer = KnowledgeOptimizer( + objective="Understand GPU instance pricing and limitations across cloud providers", + required_knowledge=[ + "pricing structure", + "GPU specifications", + "usage limits", + "availability zones" + ], + confidence_threshold=0.85 + ) + + result = await crawler.arun( + urls=[ + "https://aws.amazon.com/ec2/pricing/", + "https://cloud.google.com/gpu", + "https://azure.microsoft.com/pricing/" + ], + optimizer=optimizer, + optimization_mode="minimal_extraction" + ) + + print(f"Knowledge Coverage: {result.knowledge_coverage}") + print(f"Data Efficiency: {result.efficiency_ratio}") + print(f"Extracted Content: {result.optimal_content}") +``` + +### 1.3 Agentic Crawler +An autonomous system capable of understanding complex goals and automatically planning 
and executing multi-step crawling operations. + +Key Features: +- Autonomous goal interpretation +- Dynamic step planning +- Interactive navigation capabilities +- Visual recognition and interaction +- Automatic error recovery + +```python +from crawl4ai import AsyncWebCrawler +from crawl4ai.agents import CrawlerAgent + +async with AsyncWebCrawler() as crawler: + agent = CrawlerAgent(crawler) + + # Automatic planning and execution + result = await agent.arun( + goal="Find research papers about quantum computing published in 2023 with more than 50 citations", + auto_retry=True + ) + print("Generated Plan:", result.executed_steps) + print("Extracted Data:", result.data) + + # Using custom steps with automatic execution + result = await agent.arun( + goal="Extract conference deadlines from ML conferences", + custom_plan=[ + "Navigate to conference page", + "Find important dates section", + "Extract submission deadlines", + "Verify dates are for 2024" + ] + ) + + # Monitoring execution + print("Step Completion:", result.step_status) + print("Execution Time:", result.execution_time) + print("Success Rate:", result.success_rate) +``` + +# Section 2: Specialized Features 🛠️ + +This section introduces specialized tools and features that enhance Crawl4AI's capabilities for specific use cases and data extraction needs. + +### 2.1 Automated Schema Generator +A system that automatically generates JsonCssExtractionStrategy schemas from natural language descriptions, making structured data extraction accessible to all users. 
+ +Key Features: +- Natural language schema generation +- Automatic pattern detection +- Predefined schema templates +- Chrome extension for visual schema building + +```python +from crawl4ai import AsyncWebCrawler +from crawl4ai.schema import SchemaGenerator + +# Generate schema from natural language description +generator = SchemaGenerator() +schema = await generator.generate( + url="https://news-website.com", + description="For each news article on the page, I need the headline, publication date, and main image" +) + +# Use generated schema with crawler +async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://news-website.com", + extraction_strategy=schema + ) + +# Example of generated schema: +""" +{ + "name": "News Article Extractor", + "baseSelector": "article.news-item", + "fields": [ + { + "name": "headline", + "selector": "h2.article-title", + "type": "text" + }, + { + "name": "date", + "selector": "span.publish-date", + "type": "text" + }, + { + "name": "image", + "selector": "img.article-image", + "type": "attribute", + "attribute": "src" + } + ] +} +""" +``` + +### 2.2 Domain Specific Scrapers +Specialized extraction strategies optimized for common website types and platforms, providing consistent and reliable data extraction without additional configuration. 
+ +Key Features: +- Pre-configured extractors for popular platforms +- Academic site specialization (arXiv, NCBI) +- E-commerce standardization +- Documentation site handling + +```python +from crawl4ai import AsyncWebCrawler +from crawl4ai.extractors import AcademicExtractor, EcommerceExtractor + +async with AsyncWebCrawler() as crawler: + # Academic paper extraction + papers = await crawler.arun( + url="https://arxiv.org/list/cs.AI/recent", + extractor="academic", # Built-in extractor type + site_type="arxiv", # Specific site optimization + extract_fields=[ + "title", + "authors", + "abstract", + "citations" + ] + ) + + # E-commerce product data + products = await crawler.arun( + url="https://store.example.com/products", + extractor="ecommerce", + extract_fields=[ + "name", + "price", + "availability", + "reviews" + ] + ) +``` + +### 2.3 Web Embedding Index +Creates and maintains a semantic search infrastructure for crawled content, enabling efficient retrieval and querying of web content through vector embeddings. 
+ +Key Features: +- Automatic embedding generation +- Intelligent content chunking +- Efficient vector storage and indexing +- Semantic search capabilities + +```python +from crawl4ai import AsyncWebCrawler +from crawl4ai.indexing import WebIndex + +# Initialize and build index +index = WebIndex(model="efficient-mini") + +async with AsyncWebCrawler() as crawler: + # Crawl and index content + await index.build( + urls=["https://docs.example.com"], + crawler=crawler, + options={ + "chunk_method": "semantic", + "update_policy": "incremental", + "embedding_batch_size": 100 + } + ) + + # Search through indexed content + results = await index.search( + query="How to implement OAuth authentication?", + filters={ + "content_type": "technical", + "recency": "6months" + }, + top_k=5 + ) + + # Get similar content + similar = await index.find_similar( + url="https://docs.example.com/auth/oauth", + threshold=0.85 + ) +``` + +Each of these specialized features builds upon Crawl4AI's core functionality while providing targeted solutions for specific use cases. They can be used independently or combined for more complex data extraction and processing needs. + +# Section 3: Development Tools 🔧 + +This section covers tools designed to enhance the development experience, monitoring, and deployment of Crawl4AI applications. + +### 3.1 Crawl4AI Playground 🎮 + +The Crawl4AI Playground is an interactive web-based development environment that simplifies web scraping experimentation, development, and deployment. With its intuitive interface and AI-powered assistance, users can quickly prototype, test, and deploy web scraping solutions. 
+ +#### Key Features 🌟 + +##### Visual Strategy Builder +- Interactive point-and-click interface for building extraction strategies +- Real-time preview of selected elements +- Side-by-side comparison of different extraction approaches +- Visual validation of CSS selectors and XPath queries + +##### AI Assistant Integration +- Strategy recommendations based on target website analysis +- Parameter optimization suggestions +- Best practices guidance for specific use cases +- Automated error detection and resolution +- Performance optimization tips + +##### Real-Time Testing & Validation +- Live preview of extraction results +- Side-by-side comparison of multiple strategies +- Performance metrics visualization +- Automatic validation of extracted data +- Error detection and debugging tools + +##### Project Management +- Save and organize multiple scraping projects +- Version control for configurations +- Export/import project settings +- Share configurations with team members +- Project templates for common use cases + +##### Deployment Pipeline +- One-click deployment to various environments +- Docker container generation +- Cloud deployment templates (AWS, GCP, Azure) +- Scaling configuration management +- Monitoring setup automation + + +### 3.2 Performance Monitoring System +A comprehensive monitoring solution providing real-time insights into crawler operations, resource usage, and system health through both CLI and GUI interfaces. 
+ +Key Features: +- Real-time resource tracking +- Active crawl monitoring +- Performance statistics +- Customizable alerting system + +```python +from crawl4ai import AsyncWebCrawler +from crawl4ai.monitor import CrawlMonitor + +# Initialize monitoring +monitor = CrawlMonitor() + +# Start monitoring with CLI interface +await monitor.start( + mode="cli", # or "gui" + refresh_rate="1s", + metrics={ + "resources": ["cpu", "memory", "network"], + "crawls": ["active", "queued", "completed"], + "performance": ["success_rate", "response_times"] + } +) + +# Example CLI output: +""" +Crawl4AI Monitor (Live) - Press Q to exit +──────────────────────────────────────── +System Usage: + ├─ CPU: ███████░░░ 70% + └─ Memory: ████░░░░░ 2.1GB/8GB + +Active Crawls: +ID URL Status Progress +001 docs.example.com 🟢 Active 75% +002 api.service.com 🟡 Queue - + +Metrics (Last 5min): + ├─ Success Rate: 98% + ├─ Avg Response: 0.6s + └─ Pages/sec: 8.5 +""" +``` + +### 3.3 Cloud Integration +Streamlined deployment tools for setting up Crawl4AI in various cloud environments, with support for scaling and monitoring. 
+ +Key Features: +- One-click deployment solutions +- Auto-scaling configuration +- Load balancing setup +- Cloud-specific optimizations +- Monitoring integration + +```python +from crawl4ai import AsyncWebCrawler +from crawl4ai.deploy import CloudDeployer + +# Initialize deployer +deployer = CloudDeployer() + +# Deploy crawler service +deployment = await deployer.deploy( + service_name="crawler-cluster", + platform="aws", # or "gcp", "azure" + config={ + "instance_type": "compute-optimized", + "auto_scaling": { + "min_instances": 2, + "max_instances": 10, + "scale_based_on": "cpu_usage" + }, + "region": "us-east-1", + "monitoring": True + } +) + +# Get deployment status and endpoints +print(f"Service Status: {deployment.status}") +print(f"API Endpoint: {deployment.endpoint}") +print(f"Monitor URL: {deployment.monitor_url}") +``` + +These development tools work together to provide a comprehensive environment for developing, testing, monitoring, and deploying Crawl4AI applications. The Playground helps users experiment and generate optimal configurations, the Performance Monitor ensures smooth operation, and the Cloud Integration tools simplify deployment and scaling. + +# Section 4: Community & Growth 🌱 + +This section outlines initiatives designed to build and support the Crawl4AI community, provide educational resources, and ensure sustainable project growth. + +### 4.1 Sponsorship Program +A structured program to support ongoing development and maintenance of Crawl4AI while providing valuable benefits to sponsors. 
+ +Key Features: +- Multiple sponsorship tiers +- Sponsor recognition system +- Priority support for sponsors +- Early access to new features +- Custom feature development opportunities + +Program Structure (not yet finalized): +``` +Sponsorship Tiers: + +🥉 Bronze Supporter +- GitHub Sponsor badge +- Priority issue response +- Community Discord role + +🥈 Silver Supporter +- All Bronze benefits +- Technical support channel +- Vote on roadmap priorities +- Early access to beta features + +🥇 Gold Supporter +- All Silver benefits +- Custom feature requests +- Direct developer access +- Private support sessions + +💎 Diamond Partner +- All Gold benefits +- Custom development +- On-demand consulting +- Integration support +``` + +### 4.2 "How to Crawl" Video Series +A comprehensive educational resource teaching users how to effectively use Crawl4AI for various web scraping and data extraction scenarios. + +Key Features: +- Step-by-step tutorials +- Real-world use cases +- Best practices +- Integration guides +- Advanced feature deep-dives + +These community initiatives are designed to: +- Provide comprehensive learning resources +- Foster a supportive user community +- Ensure sustainable project development +- Share knowledge and best practices +- Create opportunities for collaboration + +The combination of structured support through sponsorship, educational content through video series, and interactive learning through the playground creates a robust ecosystem for both new and experienced users of Crawl4AI. From b6d6631b125bde49b402ba30ae22fc3fb4661228 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Tue, 12 Nov 2024 12:10:58 +0800 Subject: [PATCH 013/115] Enhance Async Crawler with Playwright support - Implemented new async crawler strategy using Playwright. - Introduced ManagedBrowser for better browser management. - Added support for persistent browser sessions and improved error handling. - Updated version from 0.3.73 to 0.3.731. 
- Enhanced logic in main.py for conditional mounting of static files. - Updated requirements to replace playwright_stealth with tf-playwright-stealth. --- crawl4ai/_version.py | 2 +- crawl4ai/async_crawler_strategy.py | 106 ++- crawl4ai/async_crawler_strategy_0.3.73.py | 965 ++++++++++++++++++++++ main.py | 12 +- requirements.txt | 2 +- 5 files changed, 1057 insertions(+), 30 deletions(-) create mode 100644 crawl4ai/async_crawler_strategy_0.3.73.py diff --git a/crawl4ai/_version.py b/crawl4ai/_version.py index 85030f0e..7ab71c9b 100644 --- a/crawl4ai/_version.py +++ b/crawl4ai/_version.py @@ -1,2 +1,2 @@ # crawl4ai/_version.py -__version__ = "0.3.73" \ No newline at end of file +__version__ = "0.3.731" \ No newline at end of file diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index fa50e7b5..896a0644 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -186,6 +186,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): self.sleep_on_close = kwargs.get("sleep_on_close", False) self.use_managed_browser = kwargs.get("use_managed_browser", False) self.user_data_dir = kwargs.get("user_data_dir", None) + self.use_persistent_context = kwargs.get("use_persistent_context", False) + self.chrome_channel = kwargs.get("chrome_channel", "chrome") self.managed_browser = None self.default_context = None self.hooks = { @@ -197,6 +199,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): 'before_return_html': None, 'before_retrieve_html': None } + self.extra_args = kwargs.get("extra_args", []) async def __aenter__(self): await self.start() @@ -238,36 +241,71 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): "User-Agent": self.user_agent }) else: + # Base browser arguments browser_args = { "headless": self.headless, "args": [ - "--disable-gpu", "--no-sandbox", "--disable-dev-shm-usage", - "--disable-blink-features=AutomationControlled", + "--no-first-run", + 
"--no-default-browser-check", "--disable-infobars", "--window-position=0,0", "--ignore-certificate-errors", "--ignore-certificate-errors-spki-list", - # "--headless=new", # Use the new headless mode ] } + + # Add channel if specified (try Chrome first) + if self.chrome_channel: + browser_args["channel"] = self.chrome_channel + + # Add extra args if provided + if self.extra_args: + browser_args["args"].extend(self.extra_args) # Add proxy settings if a proxy is specified if self.proxy: proxy_settings = ProxySettings(server=self.proxy) browser_args["proxy"] = proxy_settings elif self.proxy_config: - proxy_settings = ProxySettings(server=self.proxy_config.get("server"), username=self.proxy_config.get("username"), password=self.proxy_config.get("password")) + proxy_settings = ProxySettings( + server=self.proxy_config.get("server"), + username=self.proxy_config.get("username"), + password=self.proxy_config.get("password") + ) browser_args["proxy"] = proxy_settings - # Select the appropriate browser based on the browser_type - if self.browser_type == "firefox": - self.browser = await self.playwright.firefox.launch(**browser_args) - elif self.browser_type == "webkit": - self.browser = await self.playwright.webkit.launch(**browser_args) - else: - self.browser = await self.playwright.chromium.launch(**browser_args) + try: + # Select the appropriate browser based on the browser_type + if self.browser_type == "firefox": + self.browser = await self.playwright.firefox.launch(**browser_args) + elif self.browser_type == "webkit": + self.browser = await self.playwright.webkit.launch(**browser_args) + else: + if self.use_persistent_context and self.user_data_dir: + self.browser = await self.playwright.chromium.launch_persistent_context( + user_data_dir=self.user_data_dir, + **browser_args + ) + self.default_context = self.browser + else: + self.browser = await self.playwright.chromium.launch(**browser_args) + + except Exception as e: + # Fallback to chromium if Chrome channel fails 
+ if "chrome" in str(e) and browser_args.get("channel") == "chrome": + browser_args["channel"] = "chromium" + if self.use_persistent_context and self.user_data_dir: + self.browser = await self.playwright.chromium.launch_persistent_context( + user_data_dir=self.user_data_dir, + **browser_args + ) + self.default_context = self.browser + else: + self.browser = await self.playwright.chromium.launch(**browser_args) + else: + raise await self.execute_hook('on_browser_created', self.browser) @@ -461,24 +499,35 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if session_id: context, page, _ = self.sessions.get(session_id, (None, None, None)) if not context: + if self.use_persistent_context and self.browser_type in ["chrome", "chromium"]: + # In persistent context, browser is the context + context = self.browser + page = await context.new_page() + else: + # Normal context creation for non-persistent or non-Chrome browsers + context = await self.browser.new_context( + user_agent=self.user_agent, + viewport={"width": 1920, "height": 1080}, + proxy={"server": self.proxy} if self.proxy else None, + accept_downloads=True, + java_script_enabled=True + ) + await context.add_cookies([{"name": "cookiesEnabled", "value": "true", "url": url}]) + await context.set_extra_http_headers(self.headers) + page = await context.new_page() + self.sessions[session_id] = (context, page, time.time()) + else: + if self.use_persistent_context and self.browser_type in ["chrome", "chromium"]: + # In persistent context, browser is the context + context = self.browser + else: + # Normal context creation context = await self.browser.new_context( user_agent=self.user_agent, viewport={"width": 1920, "height": 1080}, - proxy={"server": self.proxy} if self.proxy else None, - accept_downloads=True, - java_script_enabled=True + proxy={"server": self.proxy} if self.proxy else None ) - await context.add_cookies([{"name": "cookiesEnabled", "value": "true", "url": url}]) await 
context.set_extra_http_headers(self.headers) - page = await context.new_page() - self.sessions[session_id] = (context, page, time.time()) - else: - context = await self.browser.new_context( - user_agent=self.user_agent, - viewport={"width": 1920, "height": 1080}, - proxy={"server": self.proxy} if self.proxy else None - ) - await context.set_extra_http_headers(self.headers) if kwargs.get("override_navigator", False) or kwargs.get("simulate_user", False) or kwargs.get("magic", False): # Inject scripts to override navigator properties @@ -512,7 +561,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): """) page = await context.new_page() - # await stealth_async(page) #, stealth_config) + if kwargs.get("magic", False): + await stealth_async(page, stealth_config) # Add console message and error logging if kwargs.get("log_console", False): @@ -544,8 +594,12 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if not kwargs.get("js_only", False): await self.execute_hook('before_goto', page) + response = await page.goto( - url, wait_until="domcontentloaded", timeout=kwargs.get("page_timeout", 60000) + url, + # wait_until=kwargs.get("wait_until", ["domcontentloaded", "networkidle"]), + wait_until=kwargs.get("wait_until", "domcontentloaded"), + timeout=kwargs.get("page_timeout", 60000) ) # response = await page.goto("about:blank") diff --git a/crawl4ai/async_crawler_strategy_0.3.73.py b/crawl4ai/async_crawler_strategy_0.3.73.py new file mode 100644 index 00000000..54835dad --- /dev/null +++ b/crawl4ai/async_crawler_strategy_0.3.73.py @@ -0,0 +1,965 @@ +import asyncio +import base64 +import time +from abc import ABC, abstractmethod +from typing import Callable, Dict, Any, List, Optional, Awaitable +import os, sys, shutil +import tempfile, subprocess +from playwright.async_api import async_playwright, Page, Browser, Error +from io import BytesIO +from PIL import Image, ImageDraw, ImageFont +from pathlib import Path +from playwright.async_api import 
ProxySettings +from pydantic import BaseModel +import hashlib +import json +import uuid + +from playwright_stealth import StealthConfig, stealth_async + +stealth_config = StealthConfig( + webdriver=True, + chrome_app=True, + chrome_csi=True, + chrome_load_times=True, + chrome_runtime=True, + navigator_languages=True, + navigator_plugins=True, + navigator_permissions=True, + webgl_vendor=True, + outerdimensions=True, + navigator_hardware_concurrency=True, + media_codecs=True, +) + + +class ManagedBrowser: + def __init__(self, browser_type: str = "chromium", user_data_dir: Optional[str] = None, headless: bool = False): + self.browser_type = browser_type + self.user_data_dir = user_data_dir + self.headless = headless + self.browser_process = None + self.temp_dir = None + self.debugging_port = 9222 + + async def start(self) -> str: + """ + Starts the browser process and returns the CDP endpoint URL. + If user_data_dir is not provided, creates a temporary directory. + """ + + # Create temp dir if needed + if not self.user_data_dir: + self.temp_dir = tempfile.mkdtemp(prefix="browser-profile-") + self.user_data_dir = self.temp_dir + + # Get browser path and args based on OS and browser type + browser_path = self._get_browser_path() + args = self._get_browser_args() + + # Start browser process + try: + self.browser_process = subprocess.Popen( + args, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE + ) + await asyncio.sleep(2) # Give browser time to start + return f"http://localhost:{self.debugging_port}" + except Exception as e: + await self.cleanup() + raise Exception(f"Failed to start browser: {e}") + + def _get_browser_path(self) -> str: + """Returns the browser executable path based on OS and browser type""" + if sys.platform == "darwin": # macOS + paths = { + "chromium": "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome", + "firefox": "/Applications/Firefox.app/Contents/MacOS/firefox", + "webkit": "/Applications/Safari.app/Contents/MacOS/Safari" + } + 
elif sys.platform == "win32": # Windows + paths = { + "chromium": "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe", + "firefox": "C:\\Program Files\\Mozilla Firefox\\firefox.exe", + "webkit": None # WebKit not supported on Windows + } + else: # Linux + paths = { + "chromium": "google-chrome", + "firefox": "firefox", + "webkit": None # WebKit not supported on Linux + } + + return paths.get(self.browser_type) + + def _get_browser_args(self) -> List[str]: + """Returns browser-specific command line arguments""" + base_args = [self._get_browser_path()] + + if self.browser_type == "chromium": + args = [ + f"--remote-debugging-port={self.debugging_port}", + f"--user-data-dir={self.user_data_dir}", + ] + if self.headless: + args.append("--headless=new") + elif self.browser_type == "firefox": + args = [ + "--remote-debugging-port", str(self.debugging_port), + "--profile", self.user_data_dir, + ] + if self.headless: + args.append("--headless") + else: + raise NotImplementedError(f"Browser type {self.browser_type} not supported") + + return base_args + args + + async def cleanup(self): + """Cleanup browser process and temporary directory""" + if self.browser_process: + try: + self.browser_process.terminate() + await asyncio.sleep(1) + if self.browser_process.poll() is None: + self.browser_process.kill() + except Exception as e: + print(f"Error terminating browser: {e}") + + if self.temp_dir and os.path.exists(self.temp_dir): + try: + shutil.rmtree(self.temp_dir) + except Exception as e: + print(f"Error removing temporary directory: {e}") + +class AsyncCrawlResponse(BaseModel): + html: str + response_headers: Dict[str, str] + status_code: int + screenshot: Optional[str] = None + get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None + + class Config: + arbitrary_types_allowed = True + +class AsyncCrawlerStrategy(ABC): + @abstractmethod + async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse: + pass + + @abstractmethod + async 
def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]: + pass + + @abstractmethod + async def take_screenshot(self, **kwargs) -> str: + pass + + @abstractmethod + def update_user_agent(self, user_agent: str): + pass + + @abstractmethod + def set_hook(self, hook_type: str, hook: Callable): + pass + +class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): + def __init__(self, use_cached_html=False, js_code=None, **kwargs): + self.use_cached_html = use_cached_html + self.user_agent = kwargs.get( + "user_agent", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" + ) + self.proxy = kwargs.get("proxy") + self.proxy_config = kwargs.get("proxy_config") + self.headless = kwargs.get("headless", True) + self.browser_type = kwargs.get("browser_type", "chromium") + self.headers = kwargs.get("headers", {}) + self.sessions = {} + self.session_ttl = 1800 + self.js_code = js_code + self.verbose = kwargs.get("verbose", False) + self.playwright = None + self.browser = None + self.sleep_on_close = kwargs.get("sleep_on_close", False) + self.use_managed_browser = kwargs.get("use_managed_browser", False) + self.user_data_dir = kwargs.get("user_data_dir", None) + self.use_persistent_context = kwargs.get("use_persistent_context", False) + self.chrome_channel = kwargs.get("chrome_channel", "chrome") + self.managed_browser = None + self.default_context = None + self.hooks = { + 'on_browser_created': None, + 'on_user_agent_updated': None, + 'on_execution_started': None, + 'before_goto': None, + 'after_goto': None, + 'before_return_html': None, + 'before_retrieve_html': None + } + self.extra_args = kwargs.get("extra_args", []) + + async def __aenter__(self): + await self.start() + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb): + await self.close() + + async def start(self): + if self.playwright is None: + self.playwright = await async_playwright().start() + if self.browser 
is None: + if self.use_managed_browser: + # Use managed browser approach + self.managed_browser = ManagedBrowser( + browser_type=self.browser_type, + user_data_dir=self.user_data_dir, + headless=self.headless + ) + cdp_url = await self.managed_browser.start() + self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url) + + # Get the default context that maintains the user profile + contexts = self.browser.contexts + if contexts: + self.default_context = contexts[0] + else: + # If no default context exists, create one + self.default_context = await self.browser.new_context( + viewport={"width": 1920, "height": 1080} + ) + + # Set up the default context + if self.default_context: + await self.default_context.set_extra_http_headers(self.headers) + + if self.user_agent: + await self.default_context.set_extra_http_headers({ + "User-Agent": self.user_agent + }) + else: + browser_args = { + "headless": self.headless, + "args": [ + "--disable-gpu", + "--no-sandbox", + "--disable-dev-shm-usage", + "--disable-blink-features=AutomationControlled", + "--disable-infobars", + "--window-position=0,0", + "--ignore-certificate-errors", + "--ignore-certificate-errors-spki-list", + # "--disable-http2", + # "--headless=new", # Use the new headless mode + ] + } + + # Add extra args if provided + if self.extra_args: + browser_args["args"].extend(self.extra_args) + + # Add proxy settings if a proxy is specified + if self.proxy: + proxy_settings = ProxySettings(server=self.proxy) + browser_args["proxy"] = proxy_settings + elif self.proxy_config: + proxy_settings = ProxySettings(server=self.proxy_config.get("server"), username=self.proxy_config.get("username"), password=self.proxy_config.get("password")) + browser_args["proxy"] = proxy_settings + + # Select the appropriate browser based on the browser_type + if self.browser_type == "firefox": + self.browser = await self.playwright.firefox.launch(**browser_args) + elif self.browser_type == "webkit": + self.browser = await 
self.playwright.webkit.launch(**browser_args) + else: + self.browser = await self.playwright.chromium.launch(**browser_args) + + # Update the headless configuration + if self.headless: + # Use the new headless mode explicitly + browser_args["args"].append("--headless=new") + + await self.execute_hook('on_browser_created', self.browser) + + async def close(self): + if self.sleep_on_close: + await asyncio.sleep(0.5) + + # Close all active sessions + session_ids = list(self.sessions.keys()) + for session_id in session_ids: + await self.kill_session(session_id) + + if self.browser: + await self.browser.close() + self.browser = None + + if self.managed_browser: + await self.managed_browser.cleanup() + self.managed_browser = None + + if self.playwright: + await self.playwright.stop() + self.playwright = None + + def __del__(self): + if self.browser or self.playwright: + asyncio.get_event_loop().run_until_complete(self.close()) + + def set_hook(self, hook_type: str, hook: Callable): + if hook_type in self.hooks: + self.hooks[hook_type] = hook + else: + raise ValueError(f"Invalid hook type: {hook_type}") + + async def execute_hook(self, hook_type: str, *args): + hook = self.hooks.get(hook_type) + if hook: + if asyncio.iscoroutinefunction(hook): + return await hook(*args) + else: + return hook(*args) + return args[0] if args else None + + def update_user_agent(self, user_agent: str): + self.user_agent = user_agent + + def set_custom_headers(self, headers: Dict[str, str]): + self.headers = headers + + async def kill_session(self, session_id: str): + if session_id in self.sessions: + context, page, _ = self.sessions[session_id] + await page.close() + if not self.use_managed_browser: + await context.close() + del self.sessions[session_id] + + def _cleanup_expired_sessions(self): + current_time = time.time() + expired_sessions = [ + sid for sid, (_, _, last_used) in self.sessions.items() + if current_time - last_used > self.session_ttl + ] + for sid in expired_sessions: + 
asyncio.create_task(self.kill_session(sid)) + + async def smart_wait(self, page: Page, wait_for: str, timeout: float = 30000): + wait_for = wait_for.strip() + + if wait_for.startswith('js:'): + # Explicitly specified JavaScript + js_code = wait_for[3:].strip() + return await self.csp_compliant_wait(page, js_code, timeout) + elif wait_for.startswith('css:'): + # Explicitly specified CSS selector + css_selector = wait_for[4:].strip() + try: + await page.wait_for_selector(css_selector, timeout=timeout) + except Error as e: + if 'Timeout' in str(e): + raise TimeoutError(f"Timeout after {timeout}ms waiting for selector '{css_selector}'") + else: + raise ValueError(f"Invalid CSS selector: '{css_selector}'") + else: + # Auto-detect based on content + if wait_for.startswith('()') or wait_for.startswith('function'): + # It's likely a JavaScript function + return await self.csp_compliant_wait(page, wait_for, timeout) + else: + # Assume it's a CSS selector first + try: + await page.wait_for_selector(wait_for, timeout=timeout) + except Error as e: + if 'Timeout' in str(e): + raise TimeoutError(f"Timeout after {timeout}ms waiting for selector '{wait_for}'") + else: + # If it's not a timeout error, it might be an invalid selector + # Let's try to evaluate it as a JavaScript function as a fallback + try: + return await self.csp_compliant_wait(page, f"() => {{{wait_for}}}", timeout) + except Error: + raise ValueError(f"Invalid wait_for parameter: '{wait_for}'. 
" + "It should be either a valid CSS selector, a JavaScript function, " + "or explicitly prefixed with 'js:' or 'css:'.") + + async def csp_compliant_wait(self, page: Page, user_wait_function: str, timeout: float = 30000): + wrapper_js = f""" + async () => {{ + const userFunction = {user_wait_function}; + const startTime = Date.now(); + while (true) {{ + if (await userFunction()) {{ + return true; + }} + if (Date.now() - startTime > {timeout}) {{ + throw new Error('Timeout waiting for condition'); + }} + await new Promise(resolve => setTimeout(resolve, 100)); + }} + }} + """ + + try: + await page.evaluate(wrapper_js) + except TimeoutError: + raise TimeoutError(f"Timeout after {timeout}ms waiting for condition") + except Exception as e: + raise RuntimeError(f"Error in wait condition: {str(e)}") + + async def process_iframes(self, page): + # Find all iframes + iframes = await page.query_selector_all('iframe') + + for i, iframe in enumerate(iframes): + try: + # Add a unique identifier to the iframe + await iframe.evaluate(f'(element) => element.id = "iframe-{i}"') + + # Get the frame associated with this iframe + frame = await iframe.content_frame() + + if frame: + # Wait for the frame to load + await frame.wait_for_load_state('load', timeout=30000) # 30 seconds timeout + + # Extract the content of the iframe's body + iframe_content = await frame.evaluate('() => document.body.innerHTML') + + # Generate a unique class name for this iframe + class_name = f'extracted-iframe-content-{i}' + + # Replace the iframe with a div containing the extracted content + _iframe = iframe_content.replace('`', '\\`') + await page.evaluate(f""" + () => {{ + const iframe = document.getElementById('iframe-{i}'); + const div = document.createElement('div'); + div.innerHTML = `{_iframe}`; + div.className = '{class_name}'; + iframe.replaceWith(div); + }} + """) + else: + print(f"Warning: Could not access content frame for iframe {i}") + except Exception as e: + print(f"Error processing iframe 
{i}: {str(e)}") + + # Return the page object + return page + + async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse: + response_headers = {} + status_code = None + + self._cleanup_expired_sessions() + session_id = kwargs.get("session_id") + + # Handle page creation differently for managed browser + if self.use_managed_browser: + if session_id: + # Reuse existing session if available + context, page, _ = self.sessions.get(session_id, (None, None, None)) + if not page: + # Create new page in default context if session doesn't exist + page = await self.default_context.new_page() + self.sessions[session_id] = (self.default_context, page, time.time()) + else: + # Create new page in default context for non-session requests + page = await self.default_context.new_page() + else: + if session_id: + context, page, _ = self.sessions.get(session_id, (None, None, None)) + if not context: + context = await self.browser.new_context( + user_agent=self.user_agent, + viewport={"width": 1920, "height": 1080}, + proxy={"server": self.proxy} if self.proxy else None, + accept_downloads=True, + java_script_enabled=True + ) + await context.add_cookies([{"name": "cookiesEnabled", "value": "true", "url": url}]) + await context.set_extra_http_headers(self.headers) + page = await context.new_page() + self.sessions[session_id] = (context, page, time.time()) + else: + context = await self.browser.new_context( + user_agent=self.user_agent, + viewport={"width": 1920, "height": 1080}, + proxy={"server": self.proxy} if self.proxy else None + ) + await context.set_extra_http_headers(self.headers) + + if kwargs.get("override_navigator", False) or kwargs.get("simulate_user", False) or kwargs.get("magic", False): + # Inject scripts to override navigator properties + await context.add_init_script(""" + // Pass the Permissions Test. + const originalQuery = window.navigator.permissions.query; + window.navigator.permissions.query = (parameters) => ( + parameters.name === 'notifications' ? 
+ Promise.resolve({ state: Notification.permission }) : + originalQuery(parameters) + ); + Object.defineProperty(navigator, 'webdriver', { + get: () => undefined + }); + window.navigator.chrome = { + runtime: {}, + // Add other properties if necessary + }; + Object.defineProperty(navigator, 'plugins', { + get: () => [1, 2, 3, 4, 5], + }); + Object.defineProperty(navigator, 'languages', { + get: () => ['en-US', 'en'], + }); + Object.defineProperty(document, 'hidden', { + get: () => false + }); + Object.defineProperty(document, 'visibilityState', { + get: () => 'visible' + }); + """) + + page = await context.new_page() + if kwargs.get("magic", False): + await stealth_async(page, stealth_config) + + # Add console message and error logging + if kwargs.get("log_console", False): + page.on("console", lambda msg: print(f"Console: {msg.text}")) + page.on("pageerror", lambda exc: print(f"Page Error: {exc}")) + + try: + if self.verbose: + print(f"[LOG] 🕸️ Crawling {url} using AsyncPlaywrightCrawlerStrategy...") + + if self.use_cached_html: + cache_file_path = os.path.join( + Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest() + ) + if os.path.exists(cache_file_path): + html = "" + with open(cache_file_path, "r") as f: + html = f.read() + # retrieve response headers and status code from cache + with open(cache_file_path + ".meta", "r") as f: + meta = json.load(f) + response_headers = meta.get("response_headers", {}) + status_code = meta.get("status_code") + response = AsyncCrawlResponse( + html=html, response_headers=response_headers, status_code=status_code + ) + return response + + if not kwargs.get("js_only", False): + await self.execute_hook('before_goto', page) + + # response = await page.goto( + # url, wait_until="domcontentloaded", timeout=kwargs.get("page_timeout", 60000) + # ) + + # Add retry logic for HTTP2 errors + max_retries = kwargs.get("max_retries", 3) + current_try = 0 + + while current_try < max_retries: + try: + response = await 
page.goto( + url, + # wait_until=kwargs.get("wait_until", ["domcontentloaded", "networkidle"]), + wait_until=kwargs.get("wait_until", "networkidle"), + timeout=kwargs.get("page_timeout", 60000) + ) + break + except Exception as e: + current_try += 1 + if "ERR_HTTP2_PROTOCOL_ERROR" in str(e): + if current_try < max_retries: + # Add exponential backoff + await asyncio.sleep(2 ** current_try) + # Try with different protocol + if 'args' not in kwargs: + kwargs['args'] = [] + kwargs['args'].extend(['--disable-http2']) + continue + if current_try == max_retries: + raise + + # response = await page.goto("about:blank") + # await page.evaluate(f"window.location.href = '{url}'") + + await self.execute_hook('after_goto', page) + + # Get status code and headers + status_code = response.status + response_headers = response.headers + else: + status_code = 200 + response_headers = {} + + # Replace the current wait_for_selector line with this more robust check: + try: + # First wait for body to exist, regardless of visibility + await page.wait_for_selector('body', state='attached', timeout=30000) + + # Then wait for it to become visible by checking CSS + await page.wait_for_function(""" + () => { + const body = document.body; + const style = window.getComputedStyle(body); + return style.display !== 'none' && + style.visibility !== 'hidden' && + style.opacity !== '0'; + } + """, timeout=30000) + + except Error as e: + # If waiting fails, let's try to diagnose the issue + visibility_info = await page.evaluate(""" + () => { + const body = document.body; + const style = window.getComputedStyle(body); + return { + display: style.display, + visibility: style.visibility, + opacity: style.opacity, + hasContent: body.innerHTML.length, + classList: Array.from(body.classList) + } + } + """) + + if self.verbose: + print(f"Body visibility debug info: {visibility_info}") + + # Even if body is hidden, we might still want to proceed + if kwargs.get('ignore_body_visibility', True): + if 
self.verbose: + print("Proceeding despite hidden body...") + pass + else: + raise Error(f"Body element is hidden: {visibility_info}") + + await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") + + js_code = kwargs.get("js_code", kwargs.get("js", self.js_code)) + if js_code: + if isinstance(js_code, str): + await page.evaluate(js_code) + elif isinstance(js_code, list): + for js in js_code: + await page.evaluate(js) + + await page.wait_for_load_state('networkidle') + # Check for on execution event + await self.execute_hook('on_execution_started', page) + + if kwargs.get("simulate_user", False) or kwargs.get("magic", False): + # Simulate user interactions + await page.mouse.move(100, 100) + await page.mouse.down() + await page.mouse.up() + await page.keyboard.press('ArrowDown') + + # Handle the wait_for parameter + wait_for = kwargs.get("wait_for") + if wait_for: + try: + await self.smart_wait(page, wait_for, timeout=kwargs.get("page_timeout", 60000)) + except Exception as e: + raise RuntimeError(f"Wait condition failed: {str(e)}") + + # Update image dimensions + update_image_dimensions_js = """ + () => { + return new Promise((resolve) => { + const filterImage = (img) => { + // Filter out images that are too small + if (img.width < 100 && img.height < 100) return false; + + // Filter out images that are not visible + const rect = img.getBoundingClientRect(); + if (rect.width === 0 || rect.height === 0) return false; + + // Filter out images with certain class names (e.g., icons, thumbnails) + if (img.classList.contains('icon') || img.classList.contains('thumbnail')) return false; + + // Filter out images with certain patterns in their src (e.g., placeholder images) + if (img.src.includes('placeholder') || img.src.includes('icon')) return false; + + return true; + }; + + const images = Array.from(document.querySelectorAll('img')).filter(filterImage); + let imagesLeft = images.length; + + if (imagesLeft === 0) { + resolve(); + return; + } + + const 
checkImage = (img) => { + if (img.complete && img.naturalWidth !== 0) { + img.setAttribute('width', img.naturalWidth); + img.setAttribute('height', img.naturalHeight); + imagesLeft--; + if (imagesLeft === 0) resolve(); + } + }; + + images.forEach(img => { + checkImage(img); + if (!img.complete) { + img.onload = () => { + checkImage(img); + }; + img.onerror = () => { + imagesLeft--; + if (imagesLeft === 0) resolve(); + }; + } + }); + + // Fallback timeout of 5 seconds + // setTimeout(() => resolve(), 5000); + resolve(); + }); + } + """ + await page.evaluate(update_image_dimensions_js) + + # Wait a bit for any onload events to complete + await page.wait_for_timeout(100) + + # Process iframes + if kwargs.get("process_iframes", False): + page = await self.process_iframes(page) + + await self.execute_hook('before_retrieve_html', page) + # Check if delay_before_return_html is set then wait for that time + delay_before_return_html = kwargs.get("delay_before_return_html") + if delay_before_return_html: + await asyncio.sleep(delay_before_return_html) + + # Check for remove_overlay_elements parameter + if kwargs.get("remove_overlay_elements", False): + await self.remove_overlay_elements(page) + + html = await page.content() + await self.execute_hook('before_return_html', page, html) + + # Check if kwargs has screenshot=True then take screenshot + screenshot_data = None + if kwargs.get("screenshot"): + # Check we have screenshot_wait_for parameter, if we have simply wait for that time + screenshot_wait_for = kwargs.get("screenshot_wait_for") + if screenshot_wait_for: + await asyncio.sleep(screenshot_wait_for) + screenshot_data = await self.take_screenshot(page) + + if self.verbose: + print(f"[LOG] ✅ Crawled {url} successfully!") + + if self.use_cached_html: + cache_file_path = os.path.join( + Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest() + ) + with open(cache_file_path, "w", encoding="utf-8") as f: + f.write(html) + # store response headers and 
status code in cache + with open(cache_file_path + ".meta", "w", encoding="utf-8") as f: + json.dump({ + "response_headers": response_headers, + "status_code": status_code + }, f) + + async def get_delayed_content(delay: float = 5.0) -> str: + if self.verbose: + print(f"[LOG] Waiting for {delay} seconds before retrieving content for {url}") + await asyncio.sleep(delay) + return await page.content() + + response = AsyncCrawlResponse( + html=html, + response_headers=response_headers, + status_code=status_code, + screenshot=screenshot_data, + get_delayed_content=get_delayed_content + ) + return response + except Error as e: + raise Error(f"[ERROR] 🚫 crawl(): Failed to crawl {url}: {str(e)}") + # finally: + # if not session_id: + # await page.close() + # await context.close() + + async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]: + semaphore_count = kwargs.get('semaphore_count', 5) # Adjust as needed + semaphore = asyncio.Semaphore(semaphore_count) + + async def crawl_with_semaphore(url): + async with semaphore: + return await self.crawl(url, **kwargs) + + tasks = [crawl_with_semaphore(url) for url in urls] + results = await asyncio.gather(*tasks, return_exceptions=True) + return [result if not isinstance(result, Exception) else str(result) for result in results] + + async def remove_overlay_elements(self, page: Page) -> None: + """ + Removes popup overlays, modals, cookie notices, and other intrusive elements from the page. 
+ + Args: + page (Page): The Playwright page instance + """ + remove_overlays_js = """ + async () => { + // Function to check if element is visible + const isVisible = (elem) => { + const style = window.getComputedStyle(elem); + return style.display !== 'none' && + style.visibility !== 'hidden' && + style.opacity !== '0'; + }; + + // Common selectors for popups and overlays + const commonSelectors = [ + // Close buttons first + 'button[class*="close" i]', 'button[class*="dismiss" i]', + 'button[aria-label*="close" i]', 'button[title*="close" i]', + 'a[class*="close" i]', 'span[class*="close" i]', + + // Cookie notices + '[class*="cookie-banner" i]', '[id*="cookie-banner" i]', + '[class*="cookie-consent" i]', '[id*="cookie-consent" i]', + + // Newsletter/subscription dialogs + '[class*="newsletter" i]', '[class*="subscribe" i]', + + // Generic popups/modals + '[class*="popup" i]', '[class*="modal" i]', + '[class*="overlay" i]', '[class*="dialog" i]', + '[role="dialog"]', '[role="alertdialog"]' + ]; + + // Try to click close buttons first + for (const selector of commonSelectors.slice(0, 6)) { + const closeButtons = document.querySelectorAll(selector); + for (const button of closeButtons) { + if (isVisible(button)) { + try { + button.click(); + await new Promise(resolve => setTimeout(resolve, 100)); + } catch (e) { + console.log('Error clicking button:', e); + } + } + } + } + + // Remove remaining overlay elements + const removeOverlays = () => { + // Find elements with high z-index + const allElements = document.querySelectorAll('*'); + for (const elem of allElements) { + const style = window.getComputedStyle(elem); + const zIndex = parseInt(style.zIndex); + const position = style.position; + + if ( + isVisible(elem) && + (zIndex > 999 || position === 'fixed' || position === 'absolute') && + ( + elem.offsetWidth > window.innerWidth * 0.5 || + elem.offsetHeight > window.innerHeight * 0.5 || + style.backgroundColor.includes('rgba') || + parseFloat(style.opacity) < 1 + 
) + ) { + elem.remove(); + } + } + + // Remove elements matching common selectors + for (const selector of commonSelectors) { + const elements = document.querySelectorAll(selector); + elements.forEach(elem => { + if (isVisible(elem)) { + elem.remove(); + } + }); + } + }; + + // Remove overlay elements + removeOverlays(); + + // Remove any fixed/sticky position elements at the top/bottom + const removeFixedElements = () => { + const elements = document.querySelectorAll('*'); + elements.forEach(elem => { + const style = window.getComputedStyle(elem); + if ( + (style.position === 'fixed' || style.position === 'sticky') && + isVisible(elem) + ) { + elem.remove(); + } + }); + }; + + removeFixedElements(); + + // Remove empty block elements as: div, p, span, etc. + const removeEmptyBlockElements = () => { + const blockElements = document.querySelectorAll('div, p, span, section, article, header, footer, aside, nav, main, ul, ol, li, dl, dt, dd, h1, h2, h3, h4, h5, h6'); + blockElements.forEach(elem => { + if (elem.innerText.trim() === '') { + elem.remove(); + } + }); + }; + + // Remove margin-right and padding-right from body (often added by modal scripts) + document.body.style.marginRight = '0px'; + document.body.style.paddingRight = '0px'; + document.body.style.overflow = 'auto'; + + // Wait a bit for any animations to complete + await new Promise(resolve => setTimeout(resolve, 100)); + } + """ + + try: + await page.evaluate(remove_overlays_js) + await page.wait_for_timeout(500) # Wait for any animations to complete + except Exception as e: + if self.verbose: + print(f"Warning: Failed to remove overlay elements: {str(e)}") + + async def take_screenshot(self, page: Page) -> str: + try: + # The page is already loaded, just take the screenshot + screenshot = await page.screenshot(full_page=True) + return base64.b64encode(screenshot).decode('utf-8') + except Exception as e: + error_message = f"Failed to take screenshot: {str(e)}" + print(error_message) + + # Generate an 
error image + img = Image.new('RGB', (800, 600), color='black') + draw = ImageDraw.Draw(img) + font = ImageFont.load_default() + draw.text((10, 10), error_message, fill=(255, 255, 255), font=font) + + buffered = BytesIO() + img.save(buffered, format="JPEG") + return base64.b64encode(buffered.getvalue()).decode('utf-8') + finally: + await page.close() + diff --git a/main.py b/main.py index 853cd0b7..a5da029c 100644 --- a/main.py +++ b/main.py @@ -321,7 +321,12 @@ app.add_middleware( # Mount the pages directory as a static directory app.mount("/pages", StaticFiles(directory=__location__ + "/pages"), name="pages") -app.mount("/mkdocs", StaticFiles(directory="site", html=True), name="mkdocs") + +# Check if site directory exists +if os.path.exists(__location__ + "/site"): + # Mount the site directory as a static directory + app.mount("/mkdocs", StaticFiles(directory="site", html=True), name="mkdocs") + site_templates = Jinja2Templates(directory=__location__ + "/site") templates = Jinja2Templates(directory=__location__ + "/pages") @@ -337,7 +342,10 @@ async def shutdown_event(): @app.get("/") def read_root(): - return RedirectResponse(url="/mkdocs") + if os.path.exists(__location__ + "/site"): + return RedirectResponse(url="/mkdocs") + # Return a json response + return {"message": "Crawl4AI API service is running"} @app.post("/crawl") diff --git a/requirements.txt b/requirements.txt index 9a942958..e83643b3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,4 +8,4 @@ playwright>=1.47,<1.48 python-dotenv~=1.0 requests~=2.26 beautifulsoup4~=4.12 -playwright_stealth~=1.0 +tf-playwright-stealth~=1.0 From 00026b5f8b9aec4ef5f4aa1fde8594c8118de74f Mon Sep 17 00:00:00 2001 From: Mahesh Date: Tue, 12 Nov 2024 14:52:51 -0700 Subject: [PATCH 014/115] feat(config): Adding a configurable way of setting the cache directory for constrained environments --- crawl4ai/async_crawler_strategy.py | 4 ++-- crawl4ai/async_database.py | 2 +- crawl4ai/async_webcrawler.py | 4 ++-- 
crawl4ai/crawler_strategy.py | 6 +++--- crawl4ai/database.py | 2 +- crawl4ai/model_loader.py | 2 +- crawl4ai/utils.py | 2 +- crawl4ai/web_crawler.py | 2 +- docs/md_v2/api/async-webcrawler.md | 2 +- setup.py | 2 +- 10 files changed, 14 insertions(+), 14 deletions(-) diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index fa50e7b5..9af9f826 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -525,7 +525,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if self.use_cached_html: cache_file_path = os.path.join( - Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest() + os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest() ) if os.path.exists(cache_file_path): html = "" @@ -725,7 +725,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if self.use_cached_html: cache_file_path = os.path.join( - Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest() + os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest() ) with open(cache_file_path, "w", encoding="utf-8") as f: f.write(html) diff --git a/crawl4ai/async_database.py b/crawl4ai/async_database.py index 78931d28..249c4b31 100644 --- a/crawl4ai/async_database.py +++ b/crawl4ai/async_database.py @@ -10,7 +10,7 @@ import logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) -DB_PATH = os.path.join(Path.home(), ".crawl4ai") +DB_PATH = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai") os.makedirs(DB_PATH, exist_ok=True) DB_PATH = os.path.join(DB_PATH, "crawl4ai.db") diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index ceb9ad28..38e429ca 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -23,14 +23,14 @@ class AsyncWebCrawler: self, crawler_strategy: 
Optional[AsyncCrawlerStrategy] = None, always_by_pass_cache: bool = False, - base_directory: str = str(Path.home()), + base_directory: str = str(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())), **kwargs, ): self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy( **kwargs ) self.always_by_pass_cache = always_by_pass_cache - # self.crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai") + # self.crawl4ai_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai") self.crawl4ai_folder = os.path.join(base_directory, ".crawl4ai") os.makedirs(self.crawl4ai_folder, exist_ok=True) os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True) diff --git a/crawl4ai/crawler_strategy.py b/crawl4ai/crawler_strategy.py index 5d6864b5..ce802e49 100644 --- a/crawl4ai/crawler_strategy.py +++ b/crawl4ai/crawler_strategy.py @@ -132,7 +132,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy): # chromedriver_autoinstaller.install() # import chromedriver_autoinstaller - # crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai") + # crawl4ai_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai") # driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=self.options) # chromedriver_path = chromedriver_autoinstaller.install() # chromedriver_path = chromedriver_autoinstaller.utils.download_chromedriver() @@ -205,7 +205,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy): url_hash = hashlib.md5(url.encode()).hexdigest() if self.use_cached_html: - cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash) + cache_file_path = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", url_hash) if os.path.exists(cache_file_path): with open(cache_file_path, "r") as f: return sanitize_input_encode(f.read()) @@ -275,7 +275,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy): self.driver = 
self.execute_hook('before_return_html', self.driver, html) # Store in cache - cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash) + cache_file_path = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", url_hash) with open(cache_file_path, "w", encoding="utf-8") as f: f.write(html) diff --git a/crawl4ai/database.py b/crawl4ai/database.py index 37d94463..42ad7017 100644 --- a/crawl4ai/database.py +++ b/crawl4ai/database.py @@ -3,7 +3,7 @@ from pathlib import Path import sqlite3 from typing import Optional, Tuple -DB_PATH = os.path.join(Path.home(), ".crawl4ai") +DB_PATH = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai") os.makedirs(DB_PATH, exist_ok=True) DB_PATH = os.path.join(DB_PATH, "crawl4ai.db") diff --git a/crawl4ai/model_loader.py b/crawl4ai/model_loader.py index 7b3a2846..d1872d7e 100644 --- a/crawl4ai/model_loader.py +++ b/crawl4ai/model_loader.py @@ -56,7 +56,7 @@ def set_model_device(model): @lru_cache() def get_home_folder(): - home_folder = os.path.join(Path.home(), ".crawl4ai") + home_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai") os.makedirs(home_folder, exist_ok=True) os.makedirs(f"{home_folder}/cache", exist_ok=True) os.makedirs(f"{home_folder}/models", exist_ok=True) diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index d96f1ded..1f15dea1 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -60,7 +60,7 @@ def get_system_memory(): raise OSError("Unsupported operating system") def get_home_folder(): - home_folder = os.path.join(Path.home(), ".crawl4ai") + home_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())), ".crawl4ai") os.makedirs(home_folder, exist_ok=True) os.makedirs(f"{home_folder}/cache", exist_ok=True) os.makedirs(f"{home_folder}/models", exist_ok=True) diff --git a/crawl4ai/web_crawler.py b/crawl4ai/web_crawler.py index 20e9b04e..d44de183 100644 
--- a/crawl4ai/web_crawler.py +++ b/crawl4ai/web_crawler.py @@ -20,7 +20,7 @@ class WebCrawler: def __init__(self, crawler_strategy: CrawlerStrategy = None, always_by_pass_cache: bool = False, verbose: bool = False): self.crawler_strategy = crawler_strategy or LocalSeleniumCrawlerStrategy(verbose=verbose) self.always_by_pass_cache = always_by_pass_cache - self.crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai") + self.crawl4ai_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai") os.makedirs(self.crawl4ai_folder, exist_ok=True) os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True) init_db() diff --git a/docs/md_v2/api/async-webcrawler.md b/docs/md_v2/api/async-webcrawler.md index 25164f6c..be956101 100644 --- a/docs/md_v2/api/async-webcrawler.md +++ b/docs/md_v2/api/async-webcrawler.md @@ -13,7 +13,7 @@ AsyncWebCrawler( # Cache Settings always_by_pass_cache: bool = False, # Always bypass cache - base_directory: str = str(Path.home()), # Base directory for cache + base_directory: str = str(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())), # Base directory for cache # Network Settings proxy: str = None, # Simple proxy URL diff --git a/setup.py b/setup.py index 93190291..90063212 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ import sys # Create the .crawl4ai folder in the user's home directory if it doesn't exist # If the folder already exists, remove the cache folder -crawl4ai_folder = Path.home() / ".crawl4ai" +crawl4ai_folder = os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()) / ".crawl4ai" cache_folder = crawl4ai_folder / "cache" if cache_folder.exists(): From bf91adf3f84ade380b819f55c444ed87c80c032b Mon Sep 17 00:00:00 2001 From: UncleCode Date: Wed, 13 Nov 2024 15:37:16 +0800 Subject: [PATCH 015/115] fix: Resolve unexpected BrowserContext closure during crawl in Docker - Removed __del__ method in AsyncPlaywrightCrawlerStrategy to ensure reliable browser lifecycle management by using explicit context 
managers. - Added process monitoring in ManagedBrowser to detect and log unexpected terminations of the browser subprocess. - Updated Docker configuration to expose port 9222 for remote debugging and allocate extra shared memory to prevent browser crashes. - Improved error handling and resource cleanup for browser instances, particularly in Docker environments. Resolves Issue #256 --- .gitignore | 1 + Dockerfile | 7 ++++++- README.md | 5 ++++- crawl4ai/__init__.py | 4 ++-- crawl4ai/async_crawler_strategy.py | 22 +++++++++++++++++++--- crawl4ai/async_webcrawler.py | 14 +++++++------- crawl4ai/config.py | 2 ++ crawl4ai/web_crawler.py | 17 ++++++++++++++++- 8 files changed, 57 insertions(+), 15 deletions(-) diff --git a/.gitignore b/.gitignore index 4c3e151e..aca02959 100644 --- a/.gitignore +++ b/.gitignore @@ -199,6 +199,7 @@ test_env/ **/.DS_Store todo.md +todo_executor.md git_changes.py git_changes.md pypi_build.sh diff --git a/Dockerfile b/Dockerfile index 9a921d03..125fb9b8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -115,7 +115,12 @@ HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \ CMD curl -f http://localhost:8000/health || exit 1 # Expose port -EXPOSE 8000 +EXPOSE 8000 11235 9222 8080 + +# Optional: Increase shared memory size to prevent browser crashes +# when loading heavy pages +RUN mkdir /dev/shm +VOLUME /dev/shm # Start the FastAPI server CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "11235"] \ No newline at end of file diff --git a/README.md b/README.md index e1a64aa1..d250f936 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# 🔥🕷️ Crawl4AI: LLM Friendly Web Crawler & Scrapper +# 🔥🕷️ Crawl4AI: LLM Friendly Web Crawler & Scraper unclecode%2Fcrawl4ai | Trendshift @@ -127,6 +127,9 @@ docker pull unclecode/crawl4ai:gpu # GPU-enabled version # Run the container docker run -p 11235:11235 unclecode/crawl4ai:basic # Replace 'basic' with your chosen version + +# In case to allocate more shared memory for the container 
+docker run --shm-size=2gb -p 11235:11235 unclecode/crawl4ai:basic ``` #### Option 2: Build from Repository diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py index 0c6a2db4..1bcc491c 100644 --- a/crawl4ai/__init__.py +++ b/crawl4ai/__init__.py @@ -26,5 +26,5 @@ if is_sync_version_installed(): print("Warning: Failed to import WebCrawler even though selenium is installed. This might be due to other missing dependencies.") else: WebCrawler = None - import warnings - print("Warning: Synchronous WebCrawler is not available. Install crawl4ai[sync] for synchronous support. However, please note that the synchronous version will be deprecated soon.") \ No newline at end of file + # import warnings + # print("Warning: Synchronous WebCrawler is not available. Install crawl4ai[sync] for synchronous support. However, please note that the synchronous version will be deprecated soon.") \ No newline at end of file diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 896a0644..57288b59 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -64,12 +64,27 @@ class ManagedBrowser: stdout=subprocess.PIPE, stderr=subprocess.PIPE ) + # Monitor browser process output for errors + asyncio.create_task(self._monitor_browser_process()) await asyncio.sleep(2) # Give browser time to start return f"http://localhost:{self.debugging_port}" except Exception as e: await self.cleanup() raise Exception(f"Failed to start browser: {e}") + async def _monitor_browser_process(self): + """Monitor the browser process for unexpected termination.""" + if self.browser_process: + stdout, stderr = await asyncio.gather( + asyncio.to_thread(self.browser_process.stdout.read), + asyncio.to_thread(self.browser_process.stderr.read) + ) + if self.browser_process.poll() is not None: + print(f"Browser process terminated unexpectedly with code {self.browser_process.returncode}") + print(f"STDOUT: {stdout.decode()}") + print(f"STDERR: 
{stderr.decode()}") + await self.cleanup() + def _get_browser_path(self) -> str: """Returns the browser executable path based on OS and browser type""" if sys.platform == "darwin": # macOS @@ -330,9 +345,10 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): await self.playwright.stop() self.playwright = None - def __del__(self): - if self.browser or self.playwright: - asyncio.get_event_loop().run_until_complete(self.close()) + # Issue #256: Remove __del__ method to avoid potential issues with async cleanup + # def __del__(self): + # if self.browser or self.playwright: + # asyncio.get_event_loop().run_until_complete(self.close()) def set_hook(self, hook_type: str, hook: Callable): if hook_type in self.hooks: diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index ceb9ad28..f580776b 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -47,17 +47,17 @@ class AsyncWebCrawler: async def awarmup(self): # Print a message for crawl4ai and its version - print(f"[LOG] 🚀 Crawl4AI {crawl4ai_version}") if self.verbose: + print(f"[LOG] 🚀 Crawl4AI {crawl4ai_version}") print("[LOG] 🌤️ Warming up the AsyncWebCrawler") # await async_db_manager.ainit_db() await async_db_manager.initialize() - await self.arun( - url="https://google.com/", - word_count_threshold=5, - bypass_cache=False, - verbose=False, - ) + # await self.arun( + # url="https://google.com/", + # word_count_threshold=5, + # bypass_cache=False, + # verbose=False, + # ) self.ready = True if self.verbose: print("[LOG] 🌞 AsyncWebCrawler is ready to crawl") diff --git a/crawl4ai/config.py b/crawl4ai/config.py index a07ca977..16638b6d 100644 --- a/crawl4ai/config.py +++ b/crawl4ai/config.py @@ -51,3 +51,5 @@ SOCIAL_MEDIA_DOMAINS = [ # If image format is in jpg, png or webp # If image is in the first half of the total images extracted from the page IMAGE_SCORE_THRESHOLD = 2 + +MAX_METRICS_HISTORY = 1000 \ No newline at end of file diff --git 
a/crawl4ai/web_crawler.py b/crawl4ai/web_crawler.py index 20e9b04e..95af6c7a 100644 --- a/crawl4ai/web_crawler.py +++ b/crawl4ai/web_crawler.py @@ -10,6 +10,7 @@ from .extraction_strategy import * from .crawler_strategy import * from typing import List from concurrent.futures import ThreadPoolExecutor +from .content_scrapping_strategy import WebScrappingStrategy from .config import * import warnings import json @@ -181,7 +182,21 @@ class WebCrawler: # Extract content from HTML try: t1 = time.time() - result = get_content_of_website_optimized(url, html, word_count_threshold, css_selector=css_selector, only_text=kwargs.get("only_text", False)) + scrapping_strategy = WebScrappingStrategy() + extra_params = {k: v for k, v in kwargs.items() if k not in ["only_text", "image_description_min_word_threshold"]} + result = scrapping_strategy.scrap( + url, + html, + word_count_threshold=word_count_threshold, + css_selector=css_selector, + only_text=kwargs.get("only_text", False), + image_description_min_word_threshold=kwargs.get( + "image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD + ), + **extra_params, + ) + + # result = get_content_of_website_optimized(url, html, word_count_threshold, css_selector=css_selector, only_text=kwargs.get("only_text", False)) if verbose: print(f"[LOG] 🚀 Content extracted for {url}, success: True, time taken: {time.time() - t1:.2f} seconds") From 61b93ebf362205e2c96c5c2d74bc1b880ca59f51 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Wed, 13 Nov 2024 15:38:30 +0800 Subject: [PATCH 016/115] Update change log --- CHANGELOG.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 583c7807..ff52e10e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,11 @@ # Changelog -# CHANGELOG +## [v0.3.731] - 2024-11-13 Changelog for Issue 256 Fix +- Fixed: Browser context unexpectedly closing in Docker environment during crawl operations. 
+- Removed: __del__ method from AsyncPlaywrightCrawlerStrategy to prevent unreliable asynchronous cleanup, ensuring - browser context is closed explicitly within context managers. +- Added: Monitoring for ManagedBrowser subprocess to detect and log unexpected terminations. +- Updated: Dockerfile configurations to expose debugging port (9222) and allocate additional shared memory for improved browser stability. +- Improved: Error handling and resource cleanup processes for browser lifecycle management within the Docker environment. ## [v0.3.73] - 2024-11-05 From c38ac29edbcebcb2f3672145424e7af3193caa6e Mon Sep 17 00:00:00 2001 From: UncleCode Date: Wed, 13 Nov 2024 19:40:40 +0800 Subject: [PATCH 017/115] perf(crawler): major performance improvements & raw HTML support - Switch to lxml parser (~4x speedup) - Add raw HTML & local file crawling support - Fix cache headers & async cleanup - Add browser process monitoring - Optimize BeautifulSoup operations - Pre-compile regex patterns Breaking: Raw HTML handling requires new URL prefixes Fixes: #256, #253 --- CHANGELOG.md | 33 +- crawl4ai/async_crawler_strategy.py | 114 +- crawl4ai/async_database.py | 67 +- crawl4ai/async_webcrawler.py | 127 +- crawl4ai/content_cleaning_strategy.py | 10 +- crawl4ai/content_scrapping_strategy.py | 102 +- crawl4ai/utils.py | 50 +- crawl4ai/web_crawler.py | 4 +- docs/md_v2/basic/prefix-based-input.md | 235 ++ tests/async/sample_wikipedia.html | 2179 ++++++++++++++++++ tests/async/test_content_scraper_strategy.py | 162 ++ 11 files changed, 2953 insertions(+), 130 deletions(-) create mode 100644 docs/md_v2/basic/prefix-based-input.md create mode 100644 tests/async/sample_wikipedia.html create mode 100644 tests/async/test_content_scraper_strategy.py diff --git a/CHANGELOG.md b/CHANGELOG.md index ff52e10e..33d09184 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,30 @@ # Changelog +# Changelog - November 13, 2024 + +### Added +- Support for raw HTML and local file crawling via URL 
prefixes ('raw:', 'file://') +- Browser process monitoring for managed browser instances +- Screenshot capability for raw HTML and local file content +- Response headers storage in cache database +- New `fit_markdown` flag for optional markdown generation + +### Changed +- Switched HTML parser from 'html.parser' to 'lxml' for ~4x performance improvement +- Optimized BeautifulSoup text conversion and element selection +- Pre-compiled regular expressions for better performance +- Improved metadata extraction efficiency +- Response headers now stored alongside HTML in cache + +### Removed +- `__del__` method from AsyncPlaywrightCrawlerStrategy to prevent async cleanup issues + +### Fixed +- Issue #256: Added support for crawling raw HTML content +- Issue #253: Implemented file:// protocol handling +- Missing response headers in cached results +- Memory leaks from improper async cleanup + ## [v0.3.731] - 2024-11-13 Changelog for Issue 256 Fix - Fixed: Browser context unexpectedly closing in Docker environment during crawl operations. - Removed: __del__ method from AsyncPlaywrightCrawlerStrategy to prevent unreliable asynchronous cleanup, ensuring - browser context is closed explicitly within context managers. 
@@ -185,7 +210,7 @@ This commit introduces several key enhancements, including improved error handli ## [v0.3.72] - 2024-10-20 ### Fixed -- Added support for parsing Base64 encoded images in WebScrappingStrategy +- Added support for parsing Base64 encoded images in WebScrapingStrategy ### Added - Forked and integrated a customized version of the html2text library for more control over Markdown generation @@ -208,7 +233,7 @@ This commit introduces several key enhancements, including improved error handli ### Developer Notes - The customized html2text library is now located within the crawl4ai package - New configuration options are available in the `config.py` file for external content handling -- The `WebScrappingStrategy` class has been updated to accommodate new external content exclusion options +- The `WebScrapingStrategy` class has been updated to accommodate new external content exclusion options ## [v0.3.71] - 2024-10-19 @@ -285,7 +310,7 @@ These updates aim to provide more flexibility in text processing, improve perfor ### Improvements 1. **Better Error Handling**: - - Enhanced error reporting in WebScrappingStrategy with detailed error messages and suggestions. + - Enhanced error reporting in WebScrapingStrategy with detailed error messages and suggestions. - Added console message and error logging for better debugging. 2. **Image Processing Enhancements**: @@ -350,7 +375,7 @@ These updates aim to provide more flexibility in text processing, improve perfor - Allows for more customized setups. ### 2. Image Processing Optimization -- Enhanced image handling in WebScrappingStrategy. +- Enhanced image handling in WebScrapingStrategy. - Added filtering for small, invisible, or irrelevant images. - Improved image scoring system for better content relevance. - Implemented JavaScript-based image dimension updating for more accurate representation. 
diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 57288b59..baa06e47 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -84,7 +84,7 @@ class ManagedBrowser: print(f"STDOUT: {stdout.decode()}") print(f"STDERR: {stderr.decode()}") await self.cleanup() - + def _get_browser_path(self) -> str: """Returns the browser executable path based on OS and browser type""" if sys.platform == "darwin": # macOS @@ -493,6 +493,75 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): return page async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse: + """ + Crawls a given URL or processes raw HTML/local file content based on the URL prefix. + + Args: + url (str): The URL to crawl. Supported prefixes: + - 'http://' or 'https://': Web URL to crawl. + - 'file://': Local file path to process. + - 'raw:': Raw HTML content to process. + **kwargs: Additional parameters: + - 'screenshot' (bool): Whether to take a screenshot. + - ... [other existing parameters] + + Returns: + AsyncCrawlResponse: The response containing HTML, headers, status code, and optional screenshot. 
+ """ + response_headers = {} + status_code = 200 # Default to 200 for local/raw HTML + screenshot_requested = kwargs.get('screenshot', False) + screenshot_data = None + + if url.startswith(('http://', 'https://')): + # Proceed with standard web crawling + return await self._crawl_web(url, **kwargs) + + elif url.startswith('file://'): + # Process local file + local_file_path = url[7:] # Remove 'file://' prefix + if not os.path.exists(local_file_path): + raise FileNotFoundError(f"Local file not found: {local_file_path}") + with open(local_file_path, 'r', encoding='utf-8') as f: + html = f.read() + if screenshot_requested: + screenshot_data = await self._generate_screenshot_from_html(html) + return AsyncCrawlResponse( + html=html, + response_headers=response_headers, + status_code=status_code, + screenshot=screenshot_data, + get_delayed_content=None + ) + + elif url.startswith('raw:'): + # Process raw HTML content + raw_html = url[4:] # Remove 'raw:' prefix + html = raw_html + if screenshot_requested: + screenshot_data = await self._generate_screenshot_from_html(html) + return AsyncCrawlResponse( + html=html, + response_headers=response_headers, + status_code=status_code, + screenshot=screenshot_data, + get_delayed_content=None + ) + else: + raise ValueError("URL must start with 'http://', 'https://', 'file://', or 'raw:'") + + + async def _crawl_web(self, url: str, **kwargs) -> AsyncCrawlResponse: + """ + Existing web crawling logic remains unchanged. + + Args: + url (str): The web URL to crawl. + **kwargs: Additional parameters. + + Returns: + AsyncCrawlResponse: The response containing HTML, headers, status code, and optional screenshot. 
+ """ response_headers = {} status_code = None @@ -792,7 +861,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if self.verbose: print(f"[LOG] ✅ Crawled {url} successfully!") - + if self.use_cached_html: cache_file_path = os.path.join( Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest() @@ -972,6 +1041,15 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): print(f"Warning: Failed to remove overlay elements: {str(e)}") async def take_screenshot(self, page: Page) -> str: + """ + Takes a screenshot of the current page. + + Args: + page (Page): The Playwright page instance + + Returns: + str: Base64-encoded screenshot image + """ try: # The page is already loaded, just take the screenshot screenshot = await page.screenshot(full_page=True) @@ -991,4 +1069,36 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): return base64.b64encode(buffered.getvalue()).decode('utf-8') finally: await page.close() + + async def _generate_screenshot_from_html(self, html: str) -> Optional[str]: + """ + Generates a screenshot from raw HTML content. + + Args: + html (str): The HTML content to render and capture. + + Returns: + Optional[str]: Base64-encoded screenshot image or an error image if failed. 
+ """ + try: + if not self.browser: + await self.start() + page = await self.browser.new_page() + await page.set_content(html, wait_until='networkidle') + screenshot = await page.screenshot(full_page=True) + await page.close() + return base64.b64encode(screenshot).decode('utf-8') + except Exception as e: + error_message = f"Failed to take screenshot: {str(e)}" + print(error_message) + + # Generate an error image + img = Image.new('RGB', (800, 600), color='black') + draw = ImageDraw.Draw(img) + font = ImageFont.load_default() + draw.text((10, 10), error_message, fill=(255, 255, 255), font=font) + + buffered = BytesIO() + img.save(buffered, format="JPEG") + return base64.b64encode(buffered.getvalue()).decode('utf-8') diff --git a/crawl4ai/async_database.py b/crawl4ai/async_database.py index 78931d28..273ca6c9 100644 --- a/crawl4ai/async_database.py +++ b/crawl4ai/async_database.py @@ -5,6 +5,7 @@ import asyncio from typing import Optional, Tuple, Dict from contextlib import asynccontextmanager import logging +import json # Added for serialization/deserialization # Set up logging logging.basicConfig(level=logging.INFO) @@ -89,7 +90,8 @@ class AsyncDatabaseManager: media TEXT DEFAULT "{}", links TEXT DEFAULT "{}", metadata TEXT DEFAULT "{}", - screenshot TEXT DEFAULT "" + screenshot TEXT DEFAULT "", + response_headers TEXT DEFAULT "{}" -- New column added ) ''') @@ -105,26 +107,51 @@ class AsyncDatabaseManager: column_names = await self.execute_with_retry(_check_columns) - for column in ['media', 'links', 'metadata', 'screenshot']: + # List of new columns to add + new_columns = ['media', 'links', 'metadata', 'screenshot', 'response_headers'] + + for column in new_columns: if column not in column_names: await self.aalter_db_add_column(column) async def aalter_db_add_column(self, new_column: str): """Add new column to the database""" async def _alter(db): - await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""') + if new_column == 
'response_headers': + await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT "{{}}"') + else: + await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""') logger.info(f"Added column '{new_column}' to the database.") await self.execute_with_retry(_alter) - async def aget_cached_url(self, url: str) -> Optional[Tuple[str, str, str, str, str, str, str, bool, str]]: + async def aget_cached_url(self, url: str) -> Optional[Tuple[str, str, str, str, str, bool, str, str, str, str]]: """Retrieve cached URL data""" async def _get(db): async with db.execute( - 'SELECT url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot FROM crawled_data WHERE url = ?', + ''' + SELECT url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot, response_headers + FROM crawled_data WHERE url = ? + ''', (url,) ) as cursor: - return await cursor.fetchone() + row = await cursor.fetchone() + if row: + # Deserialize JSON fields + return ( + row[0], # url + row[1], # html + row[2], # cleaned_html + row[3], # markdown + row[4], # extracted_content + row[5], # success + json.loads(row[6] or '{}'), # media + json.loads(row[7] or '{}'), # links + json.loads(row[8] or '{}'), # metadata + row[9], # screenshot + json.loads(row[10] or '{}') # response_headers + ) + return None try: return await self.execute_with_retry(_get) @@ -132,12 +159,27 @@ class AsyncDatabaseManager: logger.error(f"Error retrieving cached URL: {e}") return None - async def acache_url(self, url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool, media: str = "{}", links: str = "{}", metadata: str = "{}", screenshot: str = ""): + async def acache_url( + self, + url: str, + html: str, + cleaned_html: str, + markdown: str, + extracted_content: str, + success: bool, + media: str = "{}", + links: str = "{}", + metadata: str = "{}", + screenshot: str = "", + 
response_headers: str = "{}" # New parameter added + ): """Cache URL data with retry logic""" async def _cache(db): await db.execute(''' - INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + INSERT INTO crawled_data ( + url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot, response_headers + ) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) ON CONFLICT(url) DO UPDATE SET html = excluded.html, cleaned_html = excluded.cleaned_html, @@ -147,8 +189,9 @@ class AsyncDatabaseManager: media = excluded.media, links = excluded.links, metadata = excluded.metadata, - screenshot = excluded.screenshot - ''', (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot)) + screenshot = excluded.screenshot, + response_headers = excluded.response_headers -- Update response_headers + ''', (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot, response_headers)) try: await self.execute_with_retry(_cache) @@ -189,4 +232,4 @@ class AsyncDatabaseManager: logger.error(f"Error flushing database: {e}") # Create a singleton instance -async_db_manager = AsyncDatabaseManager() \ No newline at end of file +async_db_manager = AsyncDatabaseManager() diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index f580776b..9d0340dc 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -9,7 +9,7 @@ from .async_database import async_db_manager from .chunking_strategy import * from .extraction_strategy import * from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse -from .content_scrapping_strategy import WebScrappingStrategy +from .content_scrapping_strategy import WebScrapingStrategy from .config import MIN_WORD_THRESHOLD, IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD from .utils 
import ( sanitize_input_encode, @@ -47,17 +47,17 @@ class AsyncWebCrawler: async def awarmup(self): # Print a message for crawl4ai and its version + print(f"[LOG] 🚀 Crawl4AI {crawl4ai_version}") if self.verbose: - print(f"[LOG] 🚀 Crawl4AI {crawl4ai_version}") print("[LOG] 🌤️ Warming up the AsyncWebCrawler") # await async_db_manager.ainit_db() await async_db_manager.initialize() - # await self.arun( - # url="https://google.com/", - # word_count_threshold=5, - # bypass_cache=False, - # verbose=False, - # ) + await self.arun( + url="https://google.com/", + word_count_threshold=5, + bypass_cache=False, + verbose=False, + ) self.ready = True if self.verbose: print("[LOG] 🌞 AsyncWebCrawler is ready to crawl") @@ -75,6 +75,19 @@ class AsyncWebCrawler: verbose=True, **kwargs, ) -> CrawlResult: + """ + Runs the crawler for a single source: URL (web, local file, or raw HTML). + + Args: + url (str): The URL to crawl. Supported prefixes: + - 'http://' or 'https://': Web URL to crawl. + - 'file://': Local file path to process. + - 'raw:': Raw HTML content to process. + ... [other existing parameters] + + Returns: + CrawlResult: The result of the crawling and processing. 
+ """ try: extraction_strategy = extraction_strategy or NoExtractionStrategy() extraction_strategy.verbose = verbose @@ -89,8 +102,13 @@ class AsyncWebCrawler: cached = None screenshot_data = None extracted_content = None - if not bypass_cache and not self.always_by_pass_cache: + + is_web_url = url.startswith(('http://', 'https://')) + if is_web_url and not bypass_cache and not self.always_by_pass_cache: cached = await async_db_manager.aget_cached_url(url) + + # if not bypass_cache and not self.always_by_pass_cache: + # cached = await async_db_manager.aget_cached_url(url) if kwargs.get("warmup", True) and not self.ready: return None @@ -117,25 +135,32 @@ class AsyncWebCrawler: ) crawl_result = await self.aprocess_html( - url, - html, - extracted_content, - word_count_threshold, - extraction_strategy, - chunking_strategy, - css_selector, - screenshot_data, - verbose, - bool(cached), + url=url, + html=html, + extracted_content=extracted_content, + word_count_threshold=word_count_threshold, + extraction_strategy=extraction_strategy, + chunking_strategy=chunking_strategy, + css_selector=css_selector, + screenshot=screenshot_data, + verbose=verbose, + is_cached=bool(cached), async_response=async_response, bypass_cache=bypass_cache, **kwargs, ) - crawl_result.status_code = async_response.status_code if async_response else 200 - crawl_result.response_headers = async_response.response_headers if async_response else {} + + if async_response: + crawl_result.status_code = async_response.status_code + crawl_result.response_headers = async_response.response_headers + else: + crawl_result.status_code = 200 + crawl_result.response_headers = cached[10] + crawl_result.success = bool(html) crawl_result.session_id = kwargs.get("session_id", None) return crawl_result + except Exception as e: if not hasattr(e, "msg"): e.msg = str(e) @@ -155,22 +180,40 @@ class AsyncWebCrawler: verbose=True, **kwargs, ) -> List[CrawlResult]: - tasks = [ - self.arun( - url, - word_count_threshold, - 
extraction_strategy, - chunking_strategy, - bypass_cache, - css_selector, - screenshot, - user_agent, - verbose, - **kwargs - ) - for url in urls - ] - return await asyncio.gather(*tasks) + """ + Runs the crawler for multiple sources: URLs (web, local files, or raw HTML). + + Args: + urls (List[str]): A list of URLs with supported prefixes: + - 'http://' or 'https://': Web URL to crawl. + - 'file://': Local file path to process. + - 'raw:': Raw HTML content to process. + ... [other existing parameters] + + Returns: + List[CrawlResult]: The results of the crawling and processing. + """ + semaphore_count = kwargs.get('semaphore_count', 5) # Adjust as needed + semaphore = asyncio.Semaphore(semaphore_count) + + async def crawl_with_semaphore(url): + async with semaphore: + return await self.arun( + url, + word_count_threshold=word_count_threshold, + extraction_strategy=extraction_strategy, + chunking_strategy=chunking_strategy, + bypass_cache=bypass_cache, + css_selector=css_selector, + screenshot=screenshot, + user_agent=user_agent, + verbose=verbose, + **kwargs, + ) + + tasks = [crawl_with_semaphore(url) for url in urls] + results = await asyncio.gather(*tasks, return_exceptions=True) + return [result if not isinstance(result, Exception) else str(result) for result in results] async def aprocess_html( self, @@ -184,13 +227,14 @@ class AsyncWebCrawler: screenshot: str, verbose: bool, is_cached: bool, + async_response: Optional[AsyncCrawlResponse], **kwargs, ) -> CrawlResult: t = time.time() # Extract content from HTML try: t1 = time.time() - scrapping_strategy = WebScrappingStrategy() + scrapping_strategy = WebScrapingStrategy() # result = await scrapping_strategy.ascrap( result = scrapping_strategy.scrap( url, @@ -245,6 +289,12 @@ class AsyncWebCrawler: ) screenshot = None if not screenshot else screenshot + + response_headers = "{}" # Default value + if async_response: + # Serialize response_headers dict to JSON string + response_headers = 
json.dumps(async_response.response_headers, ensure_ascii=False) + if not is_cached or kwargs.get("bypass_cache", False) or self.always_by_pass_cache: await async_db_manager.acache_url( @@ -258,6 +308,7 @@ class AsyncWebCrawler: json.dumps(links), json.dumps(metadata), screenshot=screenshot, + response_headers=response_headers, ) return CrawlResult( diff --git a/crawl4ai/content_cleaning_strategy.py b/crawl4ai/content_cleaning_strategy.py index 2f052f76..b8a5053d 100644 --- a/crawl4ai/content_cleaning_strategy.py +++ b/crawl4ai/content_cleaning_strategy.py @@ -15,7 +15,7 @@ class ContentCleaningStrategy: self.link_density_threshold = 0.2 self.max_dom_depth = 10 # To prevent excessive DOM traversal - def clean(self, clean_html: str) -> str: + def clean(self, clean_html: str, soup = None) -> str: """ Main function that takes cleaned HTML and returns super cleaned HTML. @@ -28,18 +28,20 @@ class ContentCleaningStrategy: try: if not clean_html or not isinstance(clean_html, str): return '' - soup = BeautifulSoup(clean_html, 'html.parser') + if not soup: + # soup = BeautifulSoup(clean_html, 'html.parser') + soup = BeautifulSoup(clean_html, 'lxml') main_content = self.extract_main_content(soup) if main_content: super_clean_element = self.clean_element(main_content) - return str(super_clean_element) + return super_clean_element.encode_contents().decode('utf-8') else: return '' except Exception: # Handle exceptions silently or log them as needed return '' - def extract_main_content(self, soup: BeautifulSoup) -> Optional[Tag]: + def extract_main_content(self, soup) -> Optional[Tag]: """ Identifies and extracts the main content element from the HTML. 
diff --git a/crawl4ai/content_scrapping_strategy.py b/crawl4ai/content_scrapping_strategy.py index caed7319..a2dbbd96 100644 --- a/crawl4ai/content_scrapping_strategy.py +++ b/crawl4ai/content_scrapping_strategy.py @@ -1,3 +1,4 @@ +import re # Point 1: Pre-Compile Regular Expressions from abc import ABC, abstractmethod from typing import Dict, Any from bs4 import BeautifulSoup @@ -105,7 +106,39 @@ class CustomHTML2Text(HTML2Text): return super().handle_data(data, entity_char) -class ContentScrappingStrategy(ABC): +# Pre-compile regular expressions for Open Graph and Twitter metadata +OG_REGEX = re.compile(r'^og:') +TWITTER_REGEX = re.compile(r'^twitter:') +DIMENSION_REGEX = re.compile(r"(\d+)(\D*)") + +# Function to parse image height/width value and units +def parse_dimension(dimension): + if dimension: + # match = re.match(r"(\d+)(\D*)", dimension) + match = DIMENSION_REGEX.match(dimension) + if match: + number = int(match.group(1)) + unit = match.group(2) or 'px' # Default unit is 'px' if not specified + return number, unit + return None, None + +# Fetch image file metadata to extract size and extension +def fetch_image_file_size(img, base_url): + #If src is relative path construct full URL, if not it may be CDN URL + img_url = urljoin(base_url,img.get('src')) + try: + response = requests.head(img_url) + if response.status_code == 200: + return response.headers.get('Content-Length',None) + else: + print(f"Failed to retrieve file size for {img_url}") + return None + except InvalidSchema as e: + return None + finally: + return + +class ContentScrapingStrategy(ABC): @abstractmethod def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]: pass @@ -114,7 +147,7 @@ class ContentScrappingStrategy(ABC): async def ascrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]: pass -class WebScrappingStrategy(ContentScrappingStrategy): +class WebScrapingStrategy(ContentScrapingStrategy): def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]: return 
self._get_content_of_website_optimized(url, html, is_async=False, **kwargs) @@ -126,9 +159,16 @@ class WebScrappingStrategy(ContentScrappingStrategy): if not html: return None - soup = BeautifulSoup(html, 'html.parser') + # soup = BeautifulSoup(html, 'html.parser') + soup = BeautifulSoup(html, 'lxml') body = soup.body + try: + meta = extract_metadata("", soup) + except Exception as e: + print('Error extracting metadata:', str(e)) + meta = {} + image_description_min_word_threshold = kwargs.get('image_description_min_word_threshold', IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD) @@ -187,31 +227,7 @@ class WebScrappingStrategy(ContentScrappingStrategy): #Score an image for it's usefulness def score_image_for_usefulness(img, base_url, index, images_count): - # Function to parse image height/width value and units - def parse_dimension(dimension): - if dimension: - match = re.match(r"(\d+)(\D*)", dimension) - if match: - number = int(match.group(1)) - unit = match.group(2) or 'px' # Default unit is 'px' if not specified - return number, unit - return None, None - # Fetch image file metadata to extract size and extension - def fetch_image_file_size(img, base_url): - #If src is relative path construct full URL, if not it may be CDN URL - img_url = urljoin(base_url,img.get('src')) - try: - response = requests.head(img_url) - if response.status_code == 200: - return response.headers.get('Content-Length',None) - else: - print(f"Failed to retrieve file size for {img_url}") - return None - except InvalidSchema as e: - return None - finally: - return image_height = img.get('height') height_value, height_unit = parse_dimension(image_height) @@ -294,7 +310,6 @@ class WebScrappingStrategy(ContentScrappingStrategy): exclude_social_media_domains = SOCIAL_MEDIA_DOMAINS + kwargs.get('exclude_social_media_domains', []) exclude_social_media_domains = list(set(exclude_social_media_domains)) - try: if element.name == 'a' and element.get('href'): @@ -439,15 +454,7 @@ class 
WebScrappingStrategy(ContentScrappingStrategy): except Exception as e: print('Error processing element:', str(e)) return False - - #process images by filtering and extracting contextual text from the page - # imgs = body.find_all('img') - # media['images'] = [ - # result for result in - # (process_image(img, url, i, len(imgs)) for i, img in enumerate(imgs)) - # if result is not None - # ] - + process_element(body) # Update the links dictionary with unique links @@ -478,8 +485,9 @@ class WebScrappingStrategy(ContentScrappingStrategy): # Replace base64 data with empty string img['src'] = base64_pattern.sub('', src) + str_body = "" try: - str(body) + str_body = body.encode_contents().decode('utf-8') except Exception as e: # Reset body to the original HTML success = False @@ -504,11 +512,12 @@ class WebScrappingStrategy(ContentScrappingStrategy): # Append the error div to the body body.body.append(error_div) + str_body = body.encode_contents().decode('utf-8') print(f"[LOG] 😧 Error: After processing the crawled HTML and removing irrelevant tags, nothing was left in the page. Check the markdown for further details.") - cleaned_html = str(body).replace('\n\n', '\n').replace(' ', ' ') + cleaned_html = str_body.replace('\n\n', '\n').replace(' ', ' ') try: h = CustomHTML2Text() @@ -518,15 +527,14 @@ class WebScrappingStrategy(ContentScrappingStrategy): markdown = h.handle(sanitize_html(cleaned_html)) markdown = markdown.replace(' ```', '```') - try: - meta = extract_metadata(html, soup) - except Exception as e: - print('Error extracting metadata:', str(e)) - meta = {} + - cleaner = ContentCleaningStrategy() - fit_html = cleaner.clean(cleaned_html) - fit_markdown = h.handle(fit_html) + fit_markdown = "Set flag 'fit_markdown' to True to get cleaned HTML content." + fit_html = "Set flag 'fit_markdown' to True to get cleaned HTML content." 
+ if kwargs.get('fit_markdown', False): + cleaner = ContentCleaningStrategy() + fit_html = cleaner.clean(cleaned_html) + fit_markdown = h.handle(fit_html) cleaned_html = sanitize_html(cleaned_html) return { diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index d96f1ded..d8bd6992 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -736,46 +736,54 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold: 'metadata': meta } -def extract_metadata(html, soup = None): +def extract_metadata(html, soup=None): metadata = {} - if not html: + if not html and not soup: + return {} + + if not soup: + soup = BeautifulSoup(html, 'lxml') + + head = soup.head + if not head: return metadata - # Parse HTML content with BeautifulSoup - if not soup: - soup = BeautifulSoup(html, 'html.parser') - # Title - title_tag = soup.find('title') - metadata['title'] = title_tag.string if title_tag else None + title_tag = head.find('title') + metadata['title'] = title_tag.string.strip() if title_tag and title_tag.string else None # Meta description - description_tag = soup.find('meta', attrs={'name': 'description'}) - metadata['description'] = description_tag['content'] if description_tag else None + description_tag = head.find('meta', attrs={'name': 'description'}) + metadata['description'] = description_tag.get('content', '').strip() if description_tag else None # Meta keywords - keywords_tag = soup.find('meta', attrs={'name': 'keywords'}) - metadata['keywords'] = keywords_tag['content'] if keywords_tag else None + keywords_tag = head.find('meta', attrs={'name': 'keywords'}) + metadata['keywords'] = keywords_tag.get('content', '').strip() if keywords_tag else None # Meta author - author_tag = soup.find('meta', attrs={'name': 'author'}) - metadata['author'] = author_tag['content'] if author_tag else None + author_tag = head.find('meta', attrs={'name': 'author'}) + metadata['author'] = author_tag.get('content', '').strip() if author_tag else None # Open Graph 
metadata - og_tags = soup.find_all('meta', attrs={'property': lambda value: value and value.startswith('og:')}) + og_tags = head.find_all('meta', attrs={'property': re.compile(r'^og:')}) for tag in og_tags: - property_name = tag['property'] - metadata[property_name] = tag['content'] + property_name = tag.get('property', '').strip() + content = tag.get('content', '').strip() + if property_name and content: + metadata[property_name] = content # Twitter Card metadata - twitter_tags = soup.find_all('meta', attrs={'name': lambda value: value and value.startswith('twitter:')}) + twitter_tags = head.find_all('meta', attrs={'name': re.compile(r'^twitter:')}) for tag in twitter_tags: - property_name = tag['name'] - metadata[property_name] = tag['content'] - + property_name = tag.get('name', '').strip() + content = tag.get('content', '').strip() + if property_name and content: + metadata[property_name] = content + return metadata + def extract_xml_tags(string): tags = re.findall(r'<(\w+)>', string) return list(set(tags)) diff --git a/crawl4ai/web_crawler.py b/crawl4ai/web_crawler.py index 95af6c7a..c97a9cf4 100644 --- a/crawl4ai/web_crawler.py +++ b/crawl4ai/web_crawler.py @@ -10,7 +10,7 @@ from .extraction_strategy import * from .crawler_strategy import * from typing import List from concurrent.futures import ThreadPoolExecutor -from .content_scrapping_strategy import WebScrappingStrategy +from .content_scrapping_strategy import WebScrapingStrategy from .config import * import warnings import json @@ -182,7 +182,7 @@ class WebCrawler: # Extract content from HTML try: t1 = time.time() - scrapping_strategy = WebScrappingStrategy() + scrapping_strategy = WebScrapingStrategy() extra_params = {k: v for k, v in kwargs.items() if k not in ["only_text", "image_description_min_word_threshold"]} result = scrapping_strategy.scrap( url, diff --git a/docs/md_v2/basic/prefix-based-input.md b/docs/md_v2/basic/prefix-based-input.md new file mode 100644 index 00000000..42987a67 --- 
/dev/null +++ b/docs/md_v2/basic/prefix-based-input.md @@ -0,0 +1,235 @@ +# Prefix-Based Input Handling in Crawl4AI + +This guide will walk you through using the Crawl4AI library to crawl web pages, local HTML files, and raw HTML strings. We'll demonstrate these capabilities using a Wikipedia page as an example. + +## Table of Contents +- [Prefix-Based Input Handling in Crawl4AI](#prefix-based-input-handling-in-crawl4ai) + - [Table of Contents](#table-of-contents) + - [Crawling a Web URL](#crawling-a-web-url) + - [Crawling a Local HTML File](#crawling-a-local-html-file) + - [Crawling Raw HTML Content](#crawling-raw-html-content) + - [Complete Example](#complete-example) + - [**How It Works**](#how-it-works) + - [**Running the Example**](#running-the-example) + - [Conclusion](#conclusion) + +--- + + +### Crawling a Web URL + +To crawl a live web page, provide the URL starting with `http://` or `https://`. + +```python +import asyncio +from crawl4ai import AsyncWebCrawler + +async def crawl_web(): + async with AsyncWebCrawler(verbose=True) as crawler: + result = await crawler.arun(url="https://en.wikipedia.org/wiki/apple", bypass_cache=True) + if result.success: + print("Markdown Content:") + print(result.markdown) + else: + print(f"Failed to crawl: {result.error_message}") + +asyncio.run(crawl_web()) +``` + +### Crawling a Local HTML File + +To crawl a local HTML file, prefix the file path with `file://`. 
+ +```python +import asyncio +from crawl4ai import AsyncWebCrawler + +async def crawl_local_file(): + local_file_path = "/path/to/apple.html" # Replace with your file path + file_url = f"file://{local_file_path}" + + async with AsyncWebCrawler(verbose=True) as crawler: + result = await crawler.arun(url=file_url, bypass_cache=True) + if result.success: + print("Markdown Content from Local File:") + print(result.markdown) + else: + print(f"Failed to crawl local file: {result.error_message}") + +asyncio.run(crawl_local_file()) +``` + +### Crawling Raw HTML Content + +To crawl raw HTML content, prefix the HTML string with `raw:`. + +```python +import asyncio +from crawl4ai import AsyncWebCrawler + +async def crawl_raw_html(): + raw_html = "

Hello, World!

" + raw_html_url = f"raw:{raw_html}" + + async with AsyncWebCrawler(verbose=True) as crawler: + result = await crawler.arun(url=raw_html_url, bypass_cache=True) + if result.success: + print("Markdown Content from Raw HTML:") + print(result.markdown) + else: + print(f"Failed to crawl raw HTML: {result.error_message}") + +asyncio.run(crawl_raw_html()) +``` + +--- + +## Complete Example + +Below is a comprehensive script that: +1. **Crawls the Wikipedia page for "Apple".** +2. **Saves the HTML content to a local file (`apple.html`).** +3. **Crawls the local HTML file and verifies the markdown length matches the original crawl.** +4. **Crawls the raw HTML content from the saved file and verifies consistency.** + +```python +import os +import sys +import asyncio +from pathlib import Path + +# Adjust the parent directory to include the crawl4ai module +parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +sys.path.append(parent_dir) + +from crawl4ai import AsyncWebCrawler + +async def main(): + # Define the URL to crawl + wikipedia_url = "https://en.wikipedia.org/wiki/apple" + + # Define the path to save the HTML file + # Save the file in the same directory as the script + script_dir = Path(__file__).parent + html_file_path = script_dir / "apple.html" + + async with AsyncWebCrawler(verbose=True) as crawler: + print("\n=== Step 1: Crawling the Wikipedia URL ===") + # Crawl the Wikipedia URL + result = await crawler.arun(url=wikipedia_url, bypass_cache=True) + + # Check if crawling was successful + if not result.success: + print(f"Failed to crawl {wikipedia_url}: {result.error_message}") + return + + # Save the HTML content to a local file + with open(html_file_path, 'w', encoding='utf-8') as f: + f.write(result.html) + print(f"Saved HTML content to {html_file_path}") + + # Store the length of the generated markdown + web_crawl_length = len(result.markdown) + print(f"Length of markdown from web crawl: {web_crawl_length}\n") + + print("=== Step 2: 
Crawling from the Local HTML File ===") + # Construct the file URL with 'file://' prefix + file_url = f"file://{html_file_path.resolve()}" + + # Crawl the local HTML file + local_result = await crawler.arun(url=file_url, bypass_cache=True) + + # Check if crawling was successful + if not local_result.success: + print(f"Failed to crawl local file {file_url}: {local_result.error_message}") + return + + # Store the length of the generated markdown from local file + local_crawl_length = len(local_result.markdown) + print(f"Length of markdown from local file crawl: {local_crawl_length}") + + # Compare the lengths + assert web_crawl_length == local_crawl_length, ( + f"Markdown length mismatch: Web crawl ({web_crawl_length}) != Local file crawl ({local_crawl_length})" + ) + print("✅ Markdown length matches between web crawl and local file crawl.\n") + + print("=== Step 3: Crawling Using Raw HTML Content ===") + # Read the HTML content from the saved file + with open(html_file_path, 'r', encoding='utf-8') as f: + raw_html_content = f.read() + + # Prefix the raw HTML content with 'raw:' + raw_html_url = f"raw:{raw_html_content}" + + # Crawl using the raw HTML content + raw_result = await crawler.arun(url=raw_html_url, bypass_cache=True) + + # Check if crawling was successful + if not raw_result.success: + print(f"Failed to crawl raw HTML content: {raw_result.error_message}") + return + + # Store the length of the generated markdown from raw HTML + raw_crawl_length = len(raw_result.markdown) + print(f"Length of markdown from raw HTML crawl: {raw_crawl_length}") + + # Compare the lengths + assert web_crawl_length == raw_crawl_length, ( + f"Markdown length mismatch: Web crawl ({web_crawl_length}) != Raw HTML crawl ({raw_crawl_length})" + ) + print("✅ Markdown length matches between web crawl and raw HTML crawl.\n") + + print("All tests passed successfully!") + + # Clean up by removing the saved HTML file + if html_file_path.exists(): + os.remove(html_file_path) + 
print(f"Removed the saved HTML file: {html_file_path}") + +# Run the main function +if __name__ == "__main__": + asyncio.run(main()) +``` + +### **How It Works** + +1. **Step 1: Crawl the Web URL** + - Crawls `https://en.wikipedia.org/wiki/apple`. + - Saves the HTML content to `apple.html`. + - Records the length of the generated markdown. + +2. **Step 2: Crawl from the Local HTML File** + - Uses the `file://` prefix to crawl `apple.html`. + - Ensures the markdown length matches the original web crawl. + +3. **Step 3: Crawl Using Raw HTML Content** + - Reads the HTML from `apple.html`. + - Prefixes it with `raw:` and crawls. + - Verifies the markdown length matches the previous results. + +4. **Cleanup** + - Deletes the `apple.html` file after testing. + +### **Running the Example** + +1. **Save the Script:** + - Save the above code as `test_crawl4ai.py` in your project directory. + +2. **Execute the Script:** + - Run the script using: + ```bash + python test_crawl4ai.py + ``` + +3. **Observe the Output:** + - The script will print logs detailing each step. + - Assertions ensure consistency across different crawling methods. + - Upon success, it confirms that all markdown lengths match. + +--- + +## Conclusion + +With the new prefix-based input handling in **Crawl4AI**, you can effortlessly crawl web URLs, local HTML files, and raw HTML strings using a unified `url` parameter. This enhancement simplifies the API usage and provides greater flexibility for diverse crawling scenarios. + diff --git a/tests/async/sample_wikipedia.html b/tests/async/sample_wikipedia.html new file mode 100644 index 00000000..a22b3e3f --- /dev/null +++ b/tests/async/sample_wikipedia.html @@ -0,0 +1,2179 @@ + + +Apple - Wikipedia + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Jump to content +
+
+
+ + + + +
+
+ + + + + +
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+ +
+
+
+
+
+ +

Apple

+ +
+ + +
+ +
+ + + +
+ +
+
+
+
+
+
+ +
+
+ + + +
+
+
+
+
+ + +
+
+
+
+
+
This is a good article. Click here for more information.
+
Page semi-protected
+
+ +
From Wikipedia, the free encyclopedia
+
+
+ + +
+ + +

+ + + +

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Apple +
+
'Cripps Pink' apples +
+
Flowers +
Scientific classification Edit this classification +
Kingdom: +Plantae +
Clade: +Tracheophytes +
Clade: +Angiosperms +
Clade: +Eudicots +
Clade: +Rosids +
Order: +Rosales +
Family: +Rosaceae +
Genus: +Malus +
Species: +
M. domestica
+
Binomial name +
Malus domestica
+
Synonyms[1][2] +
+
  • M. communis Desf., 1768
  • +
  • M. pumila Mil.
  • +
  • M. frutescens Medik.
  • +
  • M. paradisiaca (L.) Medikus
  • +
  • M. sylvestris Mil.
  • +
  • Pyrus malus L.
  • +
  • Pyrus malus var. paradisiaca L.
  • +
  • Pyrus dioica Moench
+
+

An apple is a round, edible fruit produced by an apple tree (Malus spp., among them the domestic or orchard apple; Malus domestica). Apple trees are cultivated worldwide and are the most widely grown species in the genus Malus. The tree originated in Central Asia, where its wild ancestor, Malus sieversii, is still found. Apples have been grown for thousands of years in Eurasia and were introduced to North America by European colonists. Apples have religious and mythological significance in many cultures, including Norse, Greek, and European Christian tradition. +

Apples grown from seed tend to be very different from those of their parents, and the resultant fruit frequently lacks desired characteristics. For commercial purposes, including botanical evaluation, apple cultivars are propagated by clonal grafting onto rootstocks. Apple trees grown without rootstocks tend to be larger and much slower to fruit after planting. Rootstocks are used to control the speed of growth and the size of the resulting tree, allowing for easier harvesting. +

There are more than 7,500 cultivars of apples. Different cultivars are bred for various tastes and uses, including cooking, eating raw, and cider or apple juice production. Trees and fruit are prone to fungal, bacterial, and pest problems, which can be controlled by a number of organic and non-organic means. In 2010, the fruit's genome was sequenced as part of research on disease control and selective breeding in apple production. +

+ +

Etymology

+

The word apple, whose Old English ancestor is æppel, is descended from the Proto-Germanic noun *aplaz, descended in turn from Proto-Indo-European *h₂ébōl.[3] As late as the 17th century, the word also functioned as a generic term for all fruit, including nuts. This can be compared to the 14th-century Middle English expression appel of paradis, meaning a banana.[4] +

+

Description

+

The apple is a deciduous tree, generally standing 2 to 4.5 metres (6 to 15 feet) tall in cultivation and up to 15 m (49 ft) in the wild, though more typically 2 to 10 m (6.5 to 33 ft).[5][1] When cultivated, the size, shape and branch density are determined by rootstock selection and trimming method.[5] Apple trees may naturally have a rounded to erect crown with a dense canopy of leaves.[6] The bark of the trunk is dark gray or gray-brown, but young branches are reddish or dark-brown with a smooth texture.[1][7] When young, twigs are covered in very fine downy hairs; they become hairless as they become older.[7] +

The buds are egg-shaped and dark red or purple in color; they range in size from 3 to 5 millimeters, but are usually less than 4 mm. The bud scales have very hairy edges. When emerging from the buds, the leaves are convolute, meaning that their edges overlap each other.[1] Leaves can be simple ovals (elliptic), medium or wide in width, somewhat egg-shaped with the wider portion toward their base (ovate), or even with sides that are more parallel to each other instead of curved (oblong) with a narrow pointed end.[7][1] The edges have broadly-angled teeth, but do not have lobes. The top surfaces of the leaves are glabrescent, almost hairless, while the undersides are densely covered in fine hairs.[1] The leaves are attached alternately by short leaf stems 1-to-3.5 cm (1⁄2-to-1+1⁄2 in) long.[6][1] +

Blossoms are produced in spring simultaneously with the budding of the leaves and are produced on spurs and some long shoots.[5] When the flower buds first begin to open the petals are rose-pink and fade to white or light pink when fully open with each flower 3-to-4-centimeter (1-to-1+1⁄2-inch) in diameter.[1] The five-petaled flowers are grouped in an inflorescence consisting of a cyme with 3–7 flowers.[8] The central flower of the inflorescence is called the "king bloom"; it opens first and can develop a larger fruit.[6] Open apple blossoms are damaged by even brief exposures to temperatures of −2 °C (28 °F) or less, although the overwintering wood and buds are hardy down to −40 °C (−40 °F).[8] +

+ +

Fruit

+

The fruit is a pome that matures in late summer or autumn.[1] The true fruits or carpels are the harder interior chambers inside the apple's core. There are usually five carpels inside an apple, but there may be as few as three. Each of the chambers contains one or two seeds.[9] The edible flesh is formed from the receptacle at the base of the flower.[10] +

+ +

The seeds are egg- to pear-shaped and may be colored from light brown or tan to a very dark brown, often with red shades or even purplish-black. They may have a blunt or sharp point.[11] The five sepals remain attached and stand out from the surface of the apple.[1] +

The size of the fruit varies widely between cultivars, but generally has a diameter between 2.5 and 12 cm (1 and 5 in).[7] The shape is quite variable and may be nearly round, elongated, conical, or short and wide.[12] +

The groundcolor of ripe apples is yellow, green, yellow-green or whitish yellow. The overcolor of ripe apples can be orange-red, pink-red, red, purple-red or brown-red. The overcolor amount can be 0–100%.[13] The skin may be wholly or partly russeted, making it rough and brown. The skin is covered in a protective layer of epicuticular wax.[14] The skin may also be marked with scattered dots.[1] The flesh is generally pale yellowish-white, though it can be pink, yellow or green.[13] +

+ +

Chemistry

+

Important volatile compounds in apples that contribute to their scent and flavour include acetaldehyde, ethyl acetate, 1-butanal, ethanol, 2-methylbutanal, 3-methylbutanal, ethyl propionate, ethyl 2-methylpropionate, ethyl butyrate, ethyl 2-methyl butyrate, hexanal, 1-butanol, 3-methylbutyl acetate, 2-methylbutyl acetate, 1-propyl butyrate, ethyl pentanoate, amyl acetate, 2-methyl-1-butanol, trans-2-hexenal, ethyl hexanoate, hexanol.[15][16] +

+

Taxonomy

+

The apple as a species has more than 100 alternative scientific names, or synonyms.[17] In modern times, Malus pumila and Malus domestica are the two main names in use. M. pumila is the older name, but M. domestica has become much more commonly used starting in the 21st century, especially in the western world. Two proposals were made to make M. domestica a conserved name: the earlier proposal was voted down by the Committee for Vascular Plants of the IAPT in 2014, but in April 2017 the Committee decided, with a narrow majority, that the newly popular name should be conserved.[18] The General Committee of the IAPT decided in June 2017 to approve this change, officially conserving M. domestica.[19] Nevertheless, some works published after 2017 still use M. pumila as the correct name, under an alternate taxonomy.[2] +

When first classified by Linnaeus in 1753, the pears, apples, and quinces were combined into one genus that he named Pyrus and he named the apple as Pyrus malus. This was widely accepted, however the botanist Philip Miller published an alternate classification in The Gardeners Dictionary with the apple species separated from Pyrus in 1754. He did not clearly indicate that by Malus pumila he meant the domesticated apple. Nonetheless, it was used as such by many botanists. When Moritz Balthasar Borkhausen published his scientific description of the apple in 1803 it may have been a new combination of P. malus var. domestica, but this was not directly referenced by Borkhausen.[17] The earliest use of var. domestica for the apple was by Georg Adolf Suckow in 1786.[2] +

+

Genome

+ +

Apples are diploid, with two sets of chromosomes per cell (though triploid cultivars, with three sets, are not uncommon), have 17 chromosomes and an estimated genome size of approximately 650 Mb. Several whole genome sequences have been completed and made available. The first one in 2010 was based on the diploid cultivar 'Golden Delicious'.[20] However, this first whole genome sequence contained several errors,[21] in part owing to the high degree of heterozygosity in diploid apples which, in combination with an ancient genome duplication, complicated the assembly. Recently, double- and trihaploid individuals have been sequenced, yielding whole genome sequences of higher quality.[22][23] +

The first whole genome assembly was estimated to contain around 57,000 genes,[20] though the more recent genome sequences support estimates between 42,000 and 44,700 protein-coding genes.[22][23] The availability of whole genome sequences has provided evidence that the wild ancestor of the cultivated apple most likely is Malus sieversii. Re-sequencing of multiple accessions has supported this, while also suggesting extensive introgression from Malus sylvestris following domestication.[24] +

+

Cultivation

+

History

+
Map of the origins of the cultivated apple. The wild origin is in Kazakhstan; hybridisations and repeated domestications followed, modifying many attributes of the fruit.[24]
+
color photograph of a hand holding a red apple
Wild Malus sieversii apple in Kazakhstan
+

Central Asia is generally considered the center of origin for apples due to the genetic variability in specimens there.[25] The wild ancestor of Malus domestica was Malus sieversii, found growing wild in the mountains of Central Asia in southern Kazakhstan, Kyrgyzstan, Tajikistan, and northwestern China.[5][26] Cultivation of the species, most likely beginning on the forested flanks of the Tian Shan mountains, progressed over a long period of time and permitted secondary introgression of genes from other species into the open-pollinated seeds. Significant exchange with Malus sylvestris, the crabapple, resulted in populations of apples being more related to crabapples than to the more morphologically similar progenitor Malus sieversii. In strains without recent admixture the contribution of the latter predominates.[27][28][29] +

The apple is thought to have been domesticated 4,000–10,000 years ago in the Tian Shan mountains, and then to have travelled along the Silk Road to Europe, with hybridization and introgression of wild crabapples from Siberia (M. baccata), the Caucasus (M. orientalis), and Europe (M. sylvestris). Only the M. sieversii trees growing on the western side of the Tian Shan mountains contributed genetically to the domesticated apple, not the isolated population on the eastern side.[24] +

Chinese soft apples, such as M. asiatica and M. prunifolia, have been cultivated as dessert apples for more than 2,000 years in China. These are thought to be hybrids between M. baccata and M. sieversii in Kazakhstan.[24] +

Among the traits selected for by human growers are size, fruit acidity, color, firmness, and soluble sugar. Unusually for domesticated fruits, the wild M. sieversii origin is only slightly smaller than the modern domesticated apple.[24] +

At the Sammardenchia-Cueis site near Udine in Northeastern Italy, seeds from some form of apples have been found in material carbon dated to between 6570 and 5684 BCE.[30] Genetic analysis has not yet been successfully used to determine whether such ancient apples were wild Malus sylvestris or Malus domestica containing Malus sieversii ancestry. It is hard to distinguish in the archeological record between foraged wild apples and apple plantations.[31] +

There is indirect evidence of apple cultivation in the third millennium BCE in the Middle East.[31] There is direct evidence, apple cores, dated to the 10th century BCE from a Judean site between the Sinai and Negev. +[32] There was substantial apple production in European classical antiquity, and grafting was certainly known then.[31] Grafting is an essential part of modern domesticated apple production, to be able to propagate the best cultivars; it is unclear when apple tree grafting was invented.[31] +

+ +

The Roman writer Pliny the Elder describes a method of storage for apples from his time in the 1st century. He says they should be placed in a room with good air circulation from a north facing window on a bed of straw, chaff, or mats with windfalls kept separately.[33] Though methods like this will extend the availability of reasonably fresh apples, without refrigeration their lifespan is limited. Even sturdy winter apple varieties will only keep well until December in cool climates.[34] For longer storage medieval Europeans strung up cored and peeled apples to dry, either whole or sliced into rings.[35] +

Of the many Old World plants that the Spanish introduced to Chiloé Archipelago in the 16th century, apple trees became particularly well adapted.[36] Apples were introduced to North America by colonists in the 17th century,[5] and the first named apple cultivar was introduced in Boston by Reverend William Blaxton in 1640.[37] The only apples native to North America are crab apples.[38] +

Apple cultivars brought as seed from Europe were spread along Native American trade routes, as well as being cultivated on colonial farms. An 1845 United States apples nursery catalogue sold 350 of the "best" cultivars, showing the proliferation of new North American cultivars by the early 19th century.[38] In the 20th century, irrigation projects in Eastern Washington began and allowed the development of the multibillion-dollar fruit industry, of which the apple is the leading product.[5] +

Until the 20th century, farmers stored apples in frostproof cellars during the winter for their own use or for sale. Improved transportation of fresh apples by train and road replaced the necessity for storage.[39][40] Controlled atmosphere facilities are used to keep apples fresh year-round. Controlled atmosphere facilities use high humidity, low oxygen, and controlled carbon dioxide levels to maintain fruit freshness. They were first researched at Cambridge University in the 1920s and first used in the United States in the 1950s.[41] +

+

Breeding

+ +
An apple tree in Germany
+

Many apples grow readily from seeds. However, apples must be propagated asexually to obtain cuttings with the characteristics of the parent. This is because seedling apples are "extreme heterozygotes". Rather than resembling their parents, seedlings are all different from each other and from their parents.[42] Triploid cultivars have an additional reproductive barrier in that three sets of chromosomes cannot be divided evenly during meiosis, yielding unequal segregation of the chromosomes (aneuploids). Even in the case when a triploid plant can produce a seed (apples are an example), it occurs infrequently, and seedlings rarely survive.[43] +

Because apples are not true breeders when planted as seeds, propagation usually involves grafting of cuttings. The rootstock used for the bottom of the graft can be selected to produce trees of a large variety of sizes, as well as changing the winter hardiness, insect and disease resistance, and soil preference of the resulting tree. Dwarf rootstocks can be used to produce very small trees (less than 3.0 m or 10 ft high at maturity), which bear fruit many years earlier in their life cycle than full size trees, and are easier to harvest.[44] +

Dwarf rootstocks for apple trees can be traced as far back as 300 BCE, to the area of Persia and Asia Minor. Alexander the Great sent samples of dwarf apple trees to Aristotle's Lyceum. Dwarf rootstocks became common by the 15th century and later went through several cycles of popularity and decline throughout the world.[45] The majority of the rootstocks used to control size in apples were developed in England in the early 1900s. The East Malling Research Station conducted extensive research into rootstocks, and their rootstocks are given an "M" prefix to designate their origin. Rootstocks marked with an "MM" prefix are Malling-series cultivars later crossed with trees of 'Northern Spy' in Merton, England.[46] +

Most new apple cultivars originate as seedlings, which either arise by chance or are bred by deliberately crossing cultivars with promising characteristics.[47] The words "seedling", "pippin", and "kernel" in the name of an apple cultivar suggest that it originated as a seedling. Apples can also form bud sports (mutations on a single branch). Some bud sports turn out to be improved strains of the parent cultivar. Some differ sufficiently from the parent tree to be considered new cultivars.[48] +

Apples have been acclimatized in Ecuador at very high altitudes, where they can often, with the needed factors, provide crops twice per year because of constant temperate conditions year-round.[49] +

+

Pollination

+ +
Apple blossom from an old Ayrshire cultivar
+
An orchard mason bee on an apple bloom in British Columbia, Canada
+

Apples are self-incompatible; they must cross-pollinate to develop fruit. During the flowering each season, apple growers often utilize pollinators to carry pollen. Honey bees are most commonly used. Orchard mason bees are also used as supplemental pollinators in commercial orchards. Bumblebee queens are sometimes present in orchards, but not usually in sufficient number to be significant pollinators.[48][50] +

Cultivars are sometimes classified by the day of peak bloom in the average 30-day blossom period, with pollinizers selected from cultivars within a 6-day overlap period. There are four to seven pollination groups in apples, depending on climate: +

+
  • Group A – Early flowering, 1 to 3 May in England ('Gravenstein', 'Red Astrachan')
  • +
  • Group B – 4 to 7 May ('Idared', 'McIntosh')
  • +
  • Group C – Mid-season flowering, 8 to 11 May ('Granny Smith', 'Cox's Orange Pippin')
  • +
  • Group D – Mid/late season flowering, 12 to 15 May ('Golden Delicious', 'Calville blanc d'hiver')
  • +
  • Group E – Late flowering, 16 to 18 May ('Braeburn', 'Reinette d'Orléans')
  • +
  • Group F – 19 to 23 May ('Suntan')
  • +
  • Group H – 24 to 28 May ('Court-Pendu Gris' – also called Court-Pendu plat)
+

One cultivar can be pollinated by a compatible cultivar from the same group or close (A with A, or A with B, but not A with C or D).[51] +

+

Maturation and harvest

+ +
L. K. Relander, the former President of Finland, with his family picking apples in the 1930s
+

Cultivars vary in their yield and the ultimate size of the tree, even when grown on the same rootstock. Some cultivars, if left unpruned, grow very large—letting them bear more fruit, but making harvesting more difficult. Depending on tree density (number of trees planted per unit surface area), mature trees typically bear 40–200 kg (90–440 lb) of apples each year, though productivity can be close to zero in poor years. Apples are harvested using three-point ladders that are designed to fit amongst the branches. Trees grafted on dwarfing rootstocks bear about 10–80 kg (20–180 lb) of fruit per year.[48] +

Some farms with apple orchards open them to the public so consumers can pick their own apples.[52] +

Crops ripen at different times of the year according to the cultivar. Cultivars that yield their crop in the summer include 'Sweet Bough' and 'Duchess'; fall producers include 'Blenheim'; winter producers include 'King', 'Swayzie', and 'Tolman Sweet'.[38] +

+

Storage

+
Different apple cultivars in a wholesale food market
+

Commercially, apples can be stored for months in controlled atmosphere chambers. Apples are commonly stored in chambers with lowered concentrations of oxygen to reduce respiration and slow softening and other changes if the fruit is already fully ripe. The gas ethylene is used by plants as a hormone which promotes ripening, decreasing the time an apple can be stored. For storage longer than about six months the apples are picked earlier, before full ripeness, when ethylene production by the fruit is low. However, in many varieties this increases their sensitivity to carbon dioxide, which also must be controlled.[53] +

For home storage, most cultivars of apple can be stored for three weeks in a pantry and four to six weeks from the date of purchase in a refrigerator that maintains 4 to 0 °C (39 to 32 °F).[54][55] Some varieties of apples (e.g. 'Granny Smith' and 'Fuji') have more than three times the storage life of others.[56] +

Non-organic apples may be sprayed with 1-methylcyclopropene, a substance that blocks the apples' ethylene receptors and temporarily prevents them from ripening.[57] +

+

Pests and diseases

+ +
Codling moth larva tunnelling inside an apple
+

Apple trees are susceptible to fungal and bacterial diseases, and to damage by insect pests. Many commercial orchards pursue a program of chemical sprays to maintain high fruit quality, tree health, and high yields. Organic orchards, by contrast, prohibit the use of synthetic pesticides, though some older pesticides are allowed. Organic methods include, for instance, introducing its natural predator to reduce the population of a particular pest. +

A wide range of pests and diseases can affect the plant. Three of the more common diseases or pests are mildew, aphids, and apple scab. +

+
  • Mildew is characterized by light grey powdery patches appearing on the leaves, shoots and flowers, normally in spring. The flowers turn a creamy yellow color and do not develop correctly. This can be treated similarly to Botrytis—eliminating the conditions that caused the disease and burning the infected plants are among recommended actions.[58]
  • +
  • Aphids are small insects with sucking mouthparts. Five species of aphids commonly attack apples: apple grain aphid, rosy apple aphid, apple aphid, spirea aphid, and the woolly apple aphid. The aphid species can be identified by color, time of year, and by differences in the cornicles (small paired projections from their rear).[59] Aphids feed on foliage using needle-like mouth parts to suck out plant juices. When present in high numbers, certain species reduce tree growth and vigor.[60]
  • +
  • Apple scab: Apple scab causes leaves to develop olive-brown spots with a velvety texture that later turn brown and become cork-like in texture. The disease also affects the fruit, which also develops similar brown spots with velvety or cork-like textures. Apple scab is spread through fungus growing in old apple leaves on the ground and spreads during warm spring weather to infect the new year's growth.[61]
+

Among the most serious disease problems is a bacterial disease called fireblight, and three fungal diseases: Gymnosporangium rust, black spot,[62] and bitter rot.[63] Codling moths, and the apple maggots of fruit flies, cause serious damage to apple fruits, making them unsaleable. Young apple trees are also prone to mammal pests like mice and deer, which feed on the soft bark of the trees, especially in winter.[61] The larvae of the apple clearwing moth (red-belted clearwing) burrow through the bark and into the phloem of apple trees, potentially causing significant damage.[64] +

+

Cultivars

+ +
An assortment of apple cultivars
+

There are more than 7,500 known cultivars (cultivated varieties) of apples.[65] Cultivars vary in their yield and the ultimate size of the tree, even when grown on the same rootstock.[66] Different cultivars are available for temperate and subtropical climates. The UK's National Fruit Collection, which is the responsibility of the Department of Environment, Food, and Rural Affairs, includes a collection of over 2,000 cultivars of apple tree in Kent.[67] The University of Reading, which is responsible for developing the UK national collection database, provides access to search the national collection. The University of Reading's work is part of the European Cooperative Programme for Plant Genetic Resources of which there are 38 countries participating in the Malus/Pyrus work group.[68] +

The UK's national fruit collection database contains much information on the characteristics and origin of many apples, including alternative names for what is essentially the same "genetic" apple cultivar. Most of these cultivars are bred for eating fresh (dessert apples), though some are cultivated specifically for cooking (cooking apples) or producing cider. Cider apples are typically too tart and astringent to eat fresh, but they give the beverage a rich flavor that dessert apples cannot.[69] +

In the United States there are many apple breeding programs associated with universities. Cornell University has had a program operating since 1880 in Geneva, New York. Among their recent well known apples is the 'SnapDragon' cultivar released in 2013. In the west Washington State University started a program to support their apple industry in 1994 and released the 'Cosmic Crisp' cultivar in 2017. The third most grown apple cultivar in the United States is the 'Honeycrisp', released by the University of Minnesota program in 1991.[70] Unusually for a popular cultivar, the 'Honeycrisp' is not directly related to another popular apple cultivar but instead to two unsuccessful cultivars.[71] In Europe there are also many breeding programs such as the Julius Kühn-Institut, the German federal research center for cultivated plants.[72] +

Commercially popular apple cultivars are soft but crisp. Other desirable qualities in modern commercial apple breeding are a colorful skin, absence of russeting, ease of shipping, lengthy storage ability, high yields, disease resistance, common apple shape, and developed flavor.[66] Modern apples are generally sweeter than older cultivars, as popular tastes in apples have varied over time. Most North Americans and Europeans favor sweet, subacid apples, but tart apples have a strong minority following.[73] Extremely sweet apples with barely any acid flavor are popular in Asia,[73] especially the Indian subcontinent.[69] +

+
Less common apple cultivars from an orchard in Italy
+

Old cultivars are often oddly shaped, russeted, and grow in a variety of textures and colors. Some find them to have better flavor than modern cultivars, but they may have other problems that make them commercially unviable—low yield, disease susceptibility, poor tolerance for storage or transport, or just being the "wrong" size.[74] A few old cultivars are still produced on a large scale, but many have been preserved by home gardeners and farmers that sell directly to local markets. Many unusual and locally important cultivars with their own unique taste and appearance exist; apple conservation campaigns have sprung up around the world to preserve such local cultivars from extinction. In the United Kingdom, old cultivars such as 'Cox's Orange Pippin' and 'Egremont Russet' are still commercially important even though by modern standards they are low yielding and susceptible to disease.[5] +

+

Production

+ + + + + + + + + + + + + + + + + + + + + + + +
Apple production
+

2022, millions of tonnes
+

+
 China47.6 +
 United States4.8 +
 Turkey4.4 +
 Poland4.3 +
 India2.6 +
World95.8 +
Source: FAOSTAT of the United Nations[75] +
+

World production of apples in 2022 was 96 million tonnes, with China producing 50% of the total (table).[75] Secondary producers were the United States, Turkey, and Poland.[75] +

+

Toxicity

+

Amygdalin

+

Apple seeds contain small amounts of amygdalin, a sugar and cyanide compound known as a cyanogenic glycoside. Ingesting small amounts of apple seeds causes no ill effects, but consumption of extremely large doses can cause adverse reactions. It may take several hours before the poison takes effect, as cyanogenic glycosides must be hydrolyzed before the cyanide ion is released.[76] The U.S. National Library of Medicine's Hazardous Substances Data Bank records no cases of amygdalin poisoning from consuming apple seeds.[77] +

+

Allergy

+

One form of apple allergy, often found in northern Europe, is called birch-apple syndrome and is found in people who are also allergic to birch pollen.[78] Allergic reactions are triggered by a protein in apples that is similar to birch pollen, and people affected by this protein can also develop allergies to other fruits, nuts, and vegetables. Reactions, which entail oral allergy syndrome (OAS), generally involve itching and inflammation of the mouth and throat,[78] but in rare cases can also include life-threatening anaphylaxis.[79] This reaction only occurs when raw fruit is consumed—the allergen is neutralized in the cooking process. The variety of apple, maturity and storage conditions can change the amount of allergen present in individual fruits. Long storage times can increase the amount of proteins that cause birch-apple syndrome.[78] +

In other areas, such as the Mediterranean, some individuals have adverse reactions to apples because of their similarity to peaches.[78] This form of apple allergy also includes OAS, but often has more severe symptoms, such as vomiting, abdominal pain and urticaria, and can be life-threatening. Individuals with this form of allergy can also develop reactions to other fruits and nuts. Cooking does not break down the protein causing this particular reaction, so affected individuals cannot eat raw or cooked apples. Freshly harvested, over-ripe fruits tend to have the highest levels of the protein that causes this reaction.[78] +

Breeding efforts have yet to produce a hypoallergenic fruit suitable for either of the two forms of apple allergy.[78] +

+

Uses

+ +

Nutrition

+
+ +
Apples, with skin (edible parts)
Nutritional value per 100 g (3.5 oz)
Energy218 kJ (52 kcal)
13.81 g
Sugars10.39
Dietary fiber2.4 g
+
0.17 g
+
0.26 g
+ + + + +
Vitamins and minerals
+
VitaminsQuantity
%DV
Vitamin A equiv.
0%
3 μg
0%
27 μg
29 μg
Thiamine (B1)
1%
0.017 mg
Riboflavin (B2)
2%
0.026 mg
Niacin (B3)
1%
0.091 mg
Pantothenic acid (B5)
1%
0.061 mg
Vitamin B6
2%
0.041 mg
Folate (B9)
1%
3 μg
Vitamin C
5%
4.6 mg
Vitamin E
1%
0.18 mg
Vitamin K
2%
2.2 μg
+
MineralsQuantity
%DV
Calcium
0%
6 mg
Iron
1%
0.12 mg
Magnesium
1%
5 mg
Manganese
2%
0.035 mg
Phosphorus
1%
11 mg
Potassium
4%
107 mg
Sodium
0%
1 mg
Zinc
0%
0.04 mg
+
Other constituentsQuantity
Water85.56 g
+

Percentages estimated using US recommendations for adults,[80] except for potassium, which is estimated based on expert recommendation from the National Academies.[81]
+
+

A raw apple is 86% water and 14% carbohydrates, with negligible content of fat and protein (table). A reference serving of a raw apple with skin weighing 100 g (3.5 oz) provides 52 calories and a moderate content of dietary fiber (table). Otherwise, there is low content of micronutrients, with the Daily Values of all falling below 10% (table). +

+

Culinary

+ +
Machine for paring, coring, and slicing apples, from Henry B. Scammell's 1897 handbook Cyclopedia of Valuable Receipts
+

Apple varieties can be grouped as cooking apples, eating apples, and cider apples, the last so astringent as to be "almost inedible".[82] Apples are consumed as juice, raw in salads, baked in pies, cooked into sauces and apple butter, or baked.[83] They are sometimes used as an ingredient in savory foods, such as sausage and stuffing.[84] +

Several techniques are used to preserve apples and apple products. Traditional methods include drying and making apple butter.[82] Juice and cider are produced commercially; cider is a significant industry in regions such as the West of England and Normandy.[82] +

A toffee apple (UK) or caramel apple (US) is a confection made by coating an apple in hot toffee or caramel candy respectively and allowing it to cool.[85][8] Apples and honey are a ritual food pairing eaten during the Jewish New Year of Rosh Hashanah.[86] +

Apples are an important ingredient in many desserts, such as pies, crumbles, and cakes. When cooked, some apple cultivars easily form a puree known as apple sauce, which can be cooked down to form a preserve, apple butter. They are often baked or stewed, and are cooked in some meat dishes.[82] +

+ +

Apples are milled or pressed to produce apple juice, which may be drunk unfiltered (called apple cider in North America), or filtered. Filtered juice is often concentrated and frozen, then reconstituted later and consumed. Apple juice can be fermented to make cider (called hard cider in North America), ciderkin, and vinegar.[8] Through distillation, various alcoholic beverages can be produced, such as applejack, Calvados, and apple brandy.[8][87] +

+

Organic production

+

Organic apples are commonly produced in the United States.[88] Due to infestations by key insects and diseases, organic production is difficult in Europe.[89] The use of pesticides containing chemicals, such as sulfur, copper, microorganisms, viruses, clay powders, or plant extracts (pyrethrum, neem) has been approved by the EU Organic Standing Committee to improve organic yield and quality.[89] A light coating of kaolin, which forms a physical barrier to some pests, also may help prevent apple sun scalding.[48] +

+

Non-browning apples

+

Apple skins and seeds contain polyphenols.[90] These are oxidised by the enzyme polyphenol oxidase, which causes browning in sliced or bruised apples, by catalyzing the oxidation of phenolic compounds to o-quinones, a browning factor.[91] Browning reduces apple taste, color, and food value. Arctic apples, a non-browning group of apples introduced to the United States market in 2019, have been genetically modified to silence the expression of polyphenol oxidase, thereby delaying a browning effect and improving apple eating quality.[92][93] The US Food and Drug Administration in 2015, and Canadian Food Inspection Agency in 2017, determined that Arctic apples are as safe and nutritious as conventional apples.[94][95] +

+

Other products

+

Apple seed oil is obtained by pressing apple seeds for manufacturing cosmetics.[96] +

+

In culture

+ +

Germanic paganism

+
Illustration of girl in a red dress, holding 3 candles in one hand and a basket of apples in the other
"Brita as Iduna" (1901) by Carl Larsson
+

In Norse mythology, the goddess Iðunn is portrayed in the Prose Edda (written in the 13th century by Snorri Sturluson) as providing apples to the gods that give them eternal youthfulness. The English scholar H. R. Ellis Davidson links apples to religious practices in Germanic paganism, from which Norse paganism developed. She points out that buckets of apples were found in the Oseberg ship burial site in Norway, that fruit and nuts (Iðunn having been described as being transformed into a nut in Skáldskaparmál) have been found in the early graves of the Germanic peoples in England and elsewhere on the continent of Europe, which may have had a symbolic meaning, and that nuts are still a recognized symbol of fertility in southwest England.[97] +

Davidson notes a connection between apples and the Vanir, a tribe of gods associated with fertility in Norse mythology, citing an instance of eleven "golden apples" being given to woo the beautiful Gerðr by Skírnir, who was acting as messenger for the major Vanir god Freyr in stanzas 19 and 20 of Skírnismál. Davidson also notes a further connection between fertility and apples in Norse mythology in chapter 2 of the Völsunga saga: when the major goddess Frigg sends King Rerir an apple after he prays to Odin for a child, Frigg's messenger (in the guise of a crow) drops the apple in his lap as he sits atop a mound.[97] Rerir's wife's consumption of the apple results in a six-year pregnancy and the birth (by Caesarean section) of their son—the hero Völsung.[98] +

Further, Davidson points out the "strange" phrase "Apples of Hel" used in an 11th-century poem by the skald Thorbiorn Brúnarson. She states this may imply that the apple was thought of by Brúnarson as the food of the dead. Further, Davidson notes that the potentially Germanic goddess Nehalennia is sometimes depicted with apples and that parallels exist in early Irish stories. Davidson asserts that while cultivation of the apple in Northern Europe extends back to at least the time of the Roman Empire and came to Europe from the Near East, the native varieties of apple trees growing in Northern Europe are small and bitter. Davidson concludes that in the figure of Iðunn "we must have a dim reflection of an old symbol: that of the guardian goddess of the life-giving fruit of the other world."[97] +

+

Greek mythology

+
Heracles with the apple of Hesperides
+

Apples appear in many religious traditions, including Greek and Roman mythology where they have an ambiguous symbolism of discord, fertility, or courtship.[99] In Greek mythology, the Greek hero Heracles, as a part of his Twelve Labours, was required to travel to the Garden of the Hesperides and pick the golden apples off the Tree of Life growing at its center.[100] +

The Greek goddess of discord, Eris, became disgruntled after she was excluded from the wedding of Peleus and Thetis.[101] In retaliation, she tossed a golden apple inscribed Καλλίστη (Kallistē, "For the most beautiful one"), into the wedding party. Three goddesses claimed the apple: Hera, Athena, and Aphrodite. Paris of Troy was appointed to select the recipient. After both Hera and Athena had tried to bribe him, Aphrodite tempted him with the most beautiful woman in the world, Helen of Sparta. He awarded the apple to Aphrodite, thus indirectly causing the Trojan War.[102][103] +

The apple was thus considered, in ancient Greece, sacred to Aphrodite. To throw an apple at someone was to symbolically declare one's love; and similarly, to catch it was to symbolically show one's acceptance of that love. An epigram claiming authorship by Plato states:[104] +

+

I throw the apple at you, and if you are willing to love me, take it and share your girlhood with me; but if your thoughts are what I pray they are not, even then take it, and consider how short-lived is beauty.

— Plato, Epigram VII
+

Atalanta, also of Greek mythology, raced all her suitors in an attempt to avoid marriage. She outran all but Hippomenes (also known as Melanion, a name possibly derived from melon, the Greek word for both "apple" and fruit in general),[100] who defeated her by cunning, not speed. Hippomenes knew that he could not win in a fair race, so he used three golden apples (gifts of Aphrodite, the goddess of love) to distract Atalanta. It took all three apples and all of his speed, but Hippomenes was finally successful, winning the race and Atalanta's hand.[105][106] +

+

Celtic mythology

+

In Celtic mythology, the otherworld has many names, including Emain Ablach, "Emain of the Apple-trees". A version of this is Avalon in Arthurian legend, or in Welsh Ynys Afallon, "Island of Apples".[107] +

+

China

+
Píngānguǒ ("Peace apples") on sale in Beijing for Christmas Eve (2017)
+

In China, apples symbolise peace, since the sounds of the first element ("píng") in the words "apple" (苹果, Píngguǒ) and "peace" (平安, Píng'ān) are homophonous in Mandarin and Cantonese.[3][108] When these two words are combined, the word Píngānguǒ (平安果, "Peace apples") is formed. This association developed further as the name for Christmas Eve in Mandarin is Píngānyè (平安夜, "Peaceful/Quiet Evening"), which made the gifting of apples at this season to friends and associates popular, as a way to wish them peace and safety.[108] +

+

Christian art

+
Adam and Eve by Albrecht Dürer (1507), showcasing the apple as a symbol of sin
+

Though the forbidden fruit of Eden in the Book of Genesis is not identified, popular Christian tradition has held that it was an apple that Eve coaxed Adam to share with her.[109] The origin of the popular identification with a fruit unknown in the Middle East in biblical times is found in wordplay with the Latin words mālum (an apple) and mălum (an evil), each of which is normally written malum.[110] The tree of the forbidden fruit is called "the tree of the knowledge of good and evil" in Genesis 2:17,[111] and the Latin for "good and evil" is bonum et malum.[112] +

Renaissance painters may also have been influenced by the story of the golden apples in the Garden of Hesperides. As a result, in the story of Adam and Eve, the apple became a symbol for knowledge, immortality, temptation, the fall of man into sin, and sin itself. The larynx in the human throat has been called the "Adam's apple" because of a notion that it was caused by the forbidden fruit remaining in the throat of Adam. The apple as symbol of sexual seduction has been used to imply human sexuality, possibly in an ironic vein.[109] +

+

Proverb

+

The proverb, "An apple a day keeps the doctor away", addressing the supposed health benefits of the fruit, has been traced to 19th-century Wales, where the original phrase was "Eat an apple on going to bed, and you'll keep the doctor from earning his bread".[113] In the 19th century and early 20th, the phrase evolved to "an apple a day, no doctor to pay" and "an apple a day sends the doctor away"; the phrasing now commonly used was first recorded in 1922.[114] +

+

See also

+ +

References

+
+
    +
  1. ^ Jump up to: a b c d e f g h i j k Dickson, Elizabeth E. (28 May 2021). "Malus domestica". Flora of North America. Archived from the original on 28 July 2024. Retrieved 27 July 2024. +
  2. +
  3. ^ Jump up to: a b c "Malus domestica (Suckow) Borkh". Plants of the World Online. Royal Botanic Gardens, Kew. Retrieved 31 July 2024. +
  4. +
  5. ^ Jump up to: a b Lim, Lisa (6 July 2021). "Where the word 'apple' came from and why the forbidden fruit was unlucky to be linked with the fall of man". Language Matters. South China Morning Post. Hong Kong, China: Alibaba Group. Archived from the original on 28 June 2023. Retrieved 28 June 2023. +
  6. +
  7. ^ "Origin and meaning of "apple" by Online Etymology Dictionary". Online Etymology Dictionary. Archived from the original on 21 December 2019. Retrieved 22 November 2019. +
  8. +
  9. ^ Jump up to: a b c d e f g Rieger, Mark. "Apple - Malus domestica". HORT 3020: Intro Fruit Crops. University of Georgia. Archived from the original on 21 January 2008. Retrieved 22 January 2008. +
  10. +
  11. ^ Jump up to: a b c "Apples - Malus domestica". North Carolina Extension Gardener Plant Toolbox. North Carolina State University. Archived from the original on 31 May 2024. Retrieved 31 July 2024. +
  12. +
  13. ^ Jump up to: a b c d Heil, Kenneth D.; O'Kane, Jr., Steve L.; Reeves, Linda Mary; Clifford, Arnold (2013). Flora of the Four Corners Region: Vascular Plants of the San Juan River Drainage, Arizona, Colorado, New Mexico, and Utah (First ed.). St. Louis, Missouri: Missouri Botanical Garden. p. 909. ISBN 978-1-930723-84-9. ISSN 0161-1542. LCCN 2012949654. OCLC 859541992. Retrieved 27 July 2024. +
  14. +
  15. ^ Jump up to: a b c d e Lim, Tong Kwee (2012). "Malus x domestica". Edible Medicinal and Non-Medicinal Plants. Vol. 4, Fruit (First ed.). Dordrecht, the Netherlands: Springer. pp. 414–415. doi:10.1007/978-94-007-4053-2_49. ISBN 978-94-007-4053-2. OCLC 795503871. +
  16. +
  17. ^ Juniper, Barrie E.; Mabberley, David J. (2006). The Story of the Apple (First ed.). Portland, Oregon: Timber Press. p. 27. ISBN 978-0-88192-784-9. LCCN 2006011869. OCLC 67383484. Retrieved 1 August 2024. +
  18. +
  19. ^ "Fruit glossary". Royal Horticultural Society. Archived from the original on 7 August 2024. Retrieved 7 August 2024. +
  20. +
  21. ^ Burford, Tom (2013). Apples of North America : 192 Exceptional Varieties for Gardeners, Growers and Cooks (First ed.). Portland, Oregon: Timber Press. pp. 22, 50, 55, 122, 123, 137, 141, 147, 159, 245, 246. ISBN 978-1-60469-249-5. LCCN 2012045130. OCLC 819860825. +
  22. +
  23. ^ "Shape". Western Agricultural Research Center. Montana State University. Archived from the original on 23 April 2024. Retrieved 30 July 2024. +
  24. +
  25. ^ Jump up to: a b Janick, Jules; Cummins, James N.; Brown, Susan K.; Hemmat, Minou (1996). "Chapter 1: Apples" (PDF). Fruit Breeding. Vol. I: Tree and Tropical Fruits. New York: John Wiley & Sons. pp. 9, 48. ISBN 978-0-471-31014-3. LCCN 95016407. OCLC 1302621533. Archived (PDF) from the original on 19 July 2013. Retrieved 30 August 2024. +
  26. +
  27. ^ "Natural Waxes on Fruits". Postharvest.tfrec.wsu.edu. 29 October 2010. Archived from the original on 24 May 2013. Retrieved 14 June 2013. +
  28. +
  29. ^ Flath, R. A.; Black, D. R.; Forrey, R. R.; McDonald, G. M.; Mon, T. R.; Teranishi, R. (1 August 1969). "Volatiles in Gravenstein Apple Essence Identified by GC-Mass Spectrometry". Journal of Chromatographic Science. 7 (8): 508. doi:10.1093/CHROMSCI/7.8.508. +
  30. +
  31. ^ Flath, Robert A.; Black, Dale Robert.; Guadagni, Dante G.; McFadden, William H.; Schultz, Thomas H. (January 1967). "Identification and organoleptic evaluation of compounds in Delicious apple essence". Journal of Agricultural and Food Chemistry. 15 (1): 29. doi:10.1021/jf60149a032. +
  32. +
  33. ^ Jump up to: a b Qian, Guan-Ze; Liu, Lian-Fen; Tang, Geng-Guo (April 2010). "(1933) Proposal to conserve the name Malus domestica against M. pumila, M. communis, M. frutescens, and Pyrus dioica ( Rosaceae )". Taxon. 59 (2): 650–652. doi:10.1002/tax.592038. +
  34. +
  35. ^ Applequist, Wendy L. (2017). "Report of the Nomenclature Committee for Vascular Plants: 69" (PDF). Taxon. 66 (2): 500–513. doi:10.12705/662.17. Archived (PDF) from the original on 7 May 2024. +
  36. +
  37. ^ Wilson, Karen L. (June 2017). "Report of the General Committee: 18". Taxon. 66 (3): 742. doi:10.12705/663.15. +
  38. +
  39. ^ Jump up to: a b Velasco, Riccardo; Zharkikh, Andrey; Affourtit, Jason; Dhingra, Amit; Cestaro, Alessandro; et al. (2010). "The genome of the domesticated apple (Malus × domestica Borkh.)". Nature Genetics. 42 (10): 833–839. doi:10.1038/ng.654. PMID 20802477. S2CID 14854514. +
  40. +
  41. ^ Di Pierro, Erica A.; Gianfranceschi, Luca; Di Guardo, Mario; Koehorst-Van Putten, Herma J.J.; Kruisselbrink, Johannes W.; et al. (2016). "A high-density, multi-parental SNP genetic map on apple validates a new mapping approach for outcrossing species". Horticulture Research. 3 (1): 16057. Bibcode:2016HorR....316057D. doi:10.1038/hortres.2016.57. PMC 5120355. PMID 27917289. +
  42. +
  43. ^ Jump up to: a b Daccord, Nicolas; Celton, Jean-Marc; Linsmith, Gareth; et al. (2017). "High-quality de novo assembly of the apple genome and methylome dynamics of early fruit development". Nature Genetics. 49 (7). Nature Communications: 1099–1106. doi:10.1038/ng.3886. hdl:10449/42064. PMID 28581499. S2CID 24690391. +
  44. +
  45. ^ Jump up to: a b Zhang, Liyi; Hu, Jiang; Han, Xiaolei; Li, Jingjing; Gao, Yuan; et al. (2019). "A high-quality apple genome assembly reveals the association of a retrotransposon and red fruit colour". Nature Communications. 10 (1). Nature Genetics: 1494. Bibcode:2019NatCo..10.1494Z. doi:10.1038/s41467-019-09518-x. PMC 6445120. PMID 30940818. +
  46. +
  47. ^ Jump up to: a b c d e Duan, Naibin; Bai, Yang; Sun, Honghe; Wang, Nan; Ma, Yumin; et al. (2017). "Genome re-sequencing reveals the history of apple and supports a two-stage model for fruit enlargement". Nature Communications. 8 (1): 249. Bibcode:2017NatCo...8..249D. doi:10.1038/s41467-017-00336-7. PMC 5557836. PMID 28811498. +
  48. +
  49. ^ Richards, Christopher M.; Volk, Gayle M.; Reilley, Ann A.; Henk, Adam D.; Lockwood, Dale R.; et al. (2009). "Genetic diversity and population structure in Malus sieversii, a wild progenitor species of domesticated apple". Tree Genetics & Genomes. 5 (2): 339–347. doi:10.1007/s11295-008-0190-9. S2CID 19847067. +
  50. +
  51. ^ Lauri, Pierre-éric; Maguylo, Karen; Trottier, Catherine (March 2006). "Architecture and size relations: an essay on the apple (Malus × domestica, Rosaceae) tree". American Journal of Botany. 93 (3): 357–368. doi:10.3732/ajb.93.3.357. PMID 21646196. Archived from the original on 20 April 2019. Retrieved 27 July 2024. +
  52. +
  53. ^ Cornille, Amandine; Gladieux, Pierre; Smulders, Marinus J. M.; Roldán-Ruiz, Isabel; Laurens, François; et al. (2012). Mauricio, Rodney (ed.). "New Insight into the History of Domesticated Apple: Secondary Contribution of the European Wild Apple to the Genome of Cultivated Varieties". PLOS Genetics. 8 (5): e1002703. doi:10.1371/journal.pgen.1002703. PMC 3349737. PMID 22589740. +
  54. +
  55. ^ Kean, Sam (17 May 2012). "ScienceShot: The Secret History of the Domesticated Apple". Archived from the original on 11 June 2016. +
  56. +
  57. ^ Coart, E.; Van Glabeke, S.; De Loose, M.; Larsen, A.S.; Roldán-Ruiz, I. (2006). "Chloroplast diversity in the genus Malus: new insights into the relationship between the European wild apple (Malus sylvestris (L.) Mill.) and the domesticated apple (Malus domestica Borkh.)". Mol. Ecol. 15 (8): 2171–2182. Bibcode:2006MolEc..15.2171C. doi:10.1111/j.1365-294x.2006.02924.x. PMID 16780433. S2CID 31481730. +
  58. +
  59. ^ Rottoli, Mauro; Pessina, Andrea (2007). "Chapter 9: Neolithic agriculture in Italy: an update of archaeobotanical data with particular emphasis on northern settlements". In Colledge, Sue; Conolly, James (eds.). The Origins and Spread of Domestic Plants in Southwest Asia and Europe (First ed.). Walnut Creek, California: Left Coast Press; University College London Institute of Archaeology Publications. pp. 142–143. ISBN 978-1-59874-988-5. OCLC 84838157. +
  60. +
  61. ^ Jump up to: a b c d Schlumbaum, Angela; van Glabeke, Sabine; Roldan-Ruiz, Isabel (January 2012). "Towards the onset of fruit tree growing north of the Alps: Ancient DNA from waterlogged apple (Malus sp.) seed fragments". Annals of Anatomy - Anatomischer Anzeiger. 194 (1): 157–162. doi:10.1016/j.aanat.2011.03.004. PMID 21501956. +
  62. +
  63. ^ Sauer, Jonathan D. (1993). Historical Geography of Crop Plants: A Select Roster (First ed.). Boca Raton, Florida: CRC Press. pp. 109–113. ISBN 978-0-8493-8901-6. LCCN 92045590. OCLC 27224696. +
  64. +
  65. ^ Plinius, Gaius Secundus (1855). The Natural History of Pliny. Vol. III. Translated by Bostock, John; Riley, Henry T. London: Henry G. Bohn. p. 303. Retrieved 3 August 2024. +
  66. +
  67. ^ Martin, Alice A. (1976). All About Apples (First ed.). Boston, Massachusetts: Houghton Mifflin Company. pp. 64–65. ISBN 978-0-395-20724-6. OCLC 1733691. Retrieved 3 August 2024. +
  68. +
  69. ^ Adamson, Melitta Weiss (2004). Food in Medieval Times (First ed.). Westport, Connecticut: Greenwood Press. pp. 19–20. ISBN 978-0-313-32147-4. LCCN 2004014054. OCLC 55738647. +
  70. +
  71. ^ Torrejón, Fernando; Cisternas, Marco; Araneda, Alberto (2004). "Efectos ambientales de la colonización española desde el río Maullín al archipiélago de Chiloé, sur de Chile" [Environmental effects of the spanish colonization from de Maullín river to the Chiloé archipelago, southern Chile]. Revista Chilena de Historia Natural (in Spanish). 77 (4): 661–677. doi:10.4067/s0716-078x2004000400009. +
  72. +
  73. ^ Smith, Archibald William (1963). A Gardener's Book of Plant Names : A Handbook of the Meaning and Origins of Plant Names (First ed.). New York: Harper & Row. p. 40. LCCN 62009906. OCLC 710612. Retrieved 10 August 2024. +
  74. +
  75. ^ Jump up to: a b c Poole, Mike (1980). "Heirloom Apples". In Lawrence, James (ed.). The Harrowsmith Reader Volume II. Camden East, Ontario: Camden House Publishing. p. 122. ISBN 978-0-920656-11-2. OCLC 1336124440. Retrieved 10 August 2024. +
  76. +
  77. ^ Van Valen, James M. (1900). History of Bergen County, New Jersey. New York: New Jersey Publishing and Engraving Company. pp. 33–34. OCLC 25697876. Retrieved 9 August 2024. +
  78. +
  79. ^ Brox, Jane (1999). Five Thousand Days Like This One (First ed.). Boston, Massachusetts: Beacon Press. pp. 150–151. ISBN 978-0-8070-2106-4. LCCN 98035051. OCLC 39605684. Retrieved 9 August 2024. +
  80. +
  81. ^ Cohen, Rachel D. (26 November 2018). "Thanks To Science, You Can Eat An Apple Every Day". The Salt. NPR. Archived from the original on 18 June 2024. Retrieved 1 August 2024. +
  82. +
  83. ^ "The Heirloom Apple Orchard". The Jentsch Lab. Cornell University. Archived from the original on 30 July 2024. Retrieved 9 August 2024. +
  84. +
  85. ^ Ranney, Thomas G. "Polyploidy: From Evolution to Landscape Plant Improvement". Proceedings of the 11th Metropolitan Tree Improvement Alliance (METRIA) Conference. 11th Metropolitan Tree Improvement Alliance Conference held in Gresham, Oregon, August 23–24, 2000. METRIA (NCSU.edu). METRIA. Archived from the original on 23 July 2010. Retrieved 7 November 2010. +
  86. +
  87. ^ Lord, William G.; Ouellette, Amy (February 2010). "Dwarf Rootstocks for Apple Trees in the Home Garden" (PDF). University of New Hampshire. Archived from the original (PDF) on 30 September 2013. Retrieved 1 September 2013. +
  88. +
  89. ^ Fallahi, Esmaeil; Colt, W. Michael; Fallahi, Bahar; Chun, Ik-Jo (January 2002). "The Importance of Apple Rootstocks on Tree Growth, Yield, Fruit Quality, Leaf Nutrition, and Photosynthesis with an Emphasis on 'Fuji'". HortTechnology. 12 (1): 38–44. doi:10.21273/HORTTECH.12.1.38. Archived (PDF) from the original on 11 February 2014. Retrieved 9 August 2024. +
  90. +
  91. ^ Parker, M.L. (September 1993). "Apple Rootstocks and Tree Spacing". North Carolina Cooperative Extension Service. Archived from the original on 11 September 2013. Retrieved 1 September 2013. +
  92. +
  93. ^ Ferree, David Curtis; Warrington, Ian J. (2003). Apples: Botany, Production, and Uses. New York: Centre for Agriculture and Bioscience International. pp. 33–35. ISBN 978-0851995922. OCLC 133167834. +
  94. +
  95. ^ Jump up to: a b c d Polomski, Bob; Reighard, Greg. "Apple HGIC 1350". Home & Garden Information Center. Clemson University. Archived from the original on 28 February 2008. Retrieved 22 January 2008. +
  96. +
  97. ^ Barahona, M. (1992). "Adaptation of Apple Varieties in Ecuador". Acta Horticulturae (310): 135–142. doi:10.17660/ActaHortic.1992.310.17. +
  98. +
  99. ^ Adamson, Nancy Lee (2011). An Assessment of Non-Apis Bees as Fruit and Vegetable Crop Pollinators in Southwest Virginia (PDF) (Doctor of Philosophy in Entomology thesis). Virginia Polytechnic Institute and State University. Archived (PDF) from the original on 20 November 2015. Retrieved 15 October 2015. +
  100. +
  101. ^ Powell, L.E. (1986). "The Chilling Requirement in Apple and Its Role in Regulating Time of Flowering in Spring in Cold-Winter Climate". Acta Horticulturae (179). Wageningen, Netherlands: International Society for Horticultural Science: 129–140. doi:10.17660/ActaHortic.1986.179.10. ISBN 978-90-6605-182-9. +
  102. +
  103. ^ Romano, Andrea (10 September 2023). "20 Best Places to Go Apple Picking in the United States". Travel + Leisure. Archived from the original on 21 April 2024. Retrieved 2 August 2024. +
  104. +
  105. ^ Graziano, Jack; Farcuh, Macarena (10 September 2021). "Controlled Atmosphere Storage of Apples". University of Maryland Extension. Archived from the original on 24 March 2023. Retrieved 2 August 2024. +
  106. +
  107. ^ "FoodKeeper App". FoodSafety.gov. United States Department of Health and Human Services. 26 April 2019. Retrieved 17 September 2024. +
  108. +
  109. ^ "4 Steps to Food Safety". FoodSafety.gov. United States Department of Health and Human Services. 12 April 2019. Retrieved 17 September 2024. +
  110. +
  111. ^ "Refrigerated storage of perishable foods". CSIRO. 26 February 2015. Archived from the original on 15 March 2015. Retrieved 25 May 2007. +
  112. +
  113. ^ Karp, David (25 October 2006). "Puff the Magic Preservative: Lasting Crunch, but Less Scent". The New York Times. Archived from the original on 3 August 2011. Retrieved 26 July 2017. +
  114. +
  115. ^ Jackson, H.S. (1914). "Powdery Mildew". In Lowther, Granville; Worthington, William (eds.). The Encyclopedia of Practical Horticulture: A Reference System of Commercial Horticulture, Covering the Practical and Scientific Phases of Horticulture, with Special Reference to Fruits and Vegetables. Vol. I. North Yakima, Washington: The Encyclopedia of Horticulture Corporation. pp. 475–476. Retrieved 1 August 2024. +
  116. +
  117. ^ Lowther, Granville; Worthington, William, eds. (1914). The Encyclopedia of Practical Horticulture: A Reference System of Commercial Horticulture, Covering the Practical and Scientific Phases of Horticulture, with Special Reference to Fruits and Vegetables. Vol. I. North Yakima, Washington: The Encyclopedia of Horticulture Corporation. pp. 45–51. Retrieved 1 August 2024. +
  118. +
  119. ^ Coli, William M.; Los, Lorraine M., eds. (2003). "Insect Pests". 2003-2004 New England Apple Pest Management Guide. University of Massachusetts Amherst. pp. 28–29. Archived from the original on 12 February 2008. Retrieved 3 March 2008. +
  120. +
  121. ^ Jump up to: a b Atthowe, Helen; Gilkeson, Linda A.; Kite, L. Patricia; Michalak, Patricia S.; Pleasant, Barbara; Reich, Lee; Scheider, Alfred F. (2009). Bradley, Fern Marshall; Ellis, Barbara W.; Martin, Deborah L. (eds.). The Organic Gardener's Handbook of Natural Pest and Disease Control. New York: Rodale, Inc. pp. 32–34. ISBN 978-1-60529-677-7. LCCN 2009039996. OCLC 419860680. +
  122. +
  123. ^ Coli, William M.; Berkett, Lorraine P.; Spitko, Robin, eds. (2003). "Other Apple Diseases". 2003-2004 New England Apple Pest Management Guide. University of Massachusetts Amherst. pp. 19–27. Archived from the original on 12 February 2008. Retrieved 3 March 2008. +
  124. +
  125. ^ Martin, Phillip L.; Krawczyk, Teresa; Khodadadi, Fatemeh; Aćimović, Srđan G.; Peter, Kari A. (2021). "Bitter Rot of Apple in the Mid-Atlantic United States: Causal Species and Evaluation of the Impacts of Regional Weather Patterns and Cultivar Susceptibility". Phytopathology. 111 (6): 966–981. doi:10.1094/PHYTO-09-20-0432-R. ISSN 0031-949X. PMID 33487025. S2CID 231701083. +
  126. +
  127. ^ Erler, Fedai (1 January 2010). "Efficacy of tree trunk coating materials in the control of the apple clearwing, Synanthedon myopaeformis". Journal of Insect Science. 10 (1): 63. doi:10.1673/031.010.6301. PMC 3014806. PMID 20672979. +
  128. +
  129. ^ Elzebroek, A. T. G.; Wind, Koop (2008). Guide to Cultivated Plants. Wallingford, United Kingdom: CABI. p. 27. ISBN 978-1-84593-356-2. LCCN 2007028459. OCLC 156975183. Archived from the original on 20 October 2020. Retrieved 6 October 2020. +
  130. +
  131. ^ Jump up to: a b "Apple – Malus domestica". Natural England. Archived from the original on 12 May 2008. Retrieved 22 January 2008. +
  132. +
  133. ^ "Home". National Fruit Collection. Archived from the original on 15 June 2012. Retrieved 2 December 2012. +
  134. +
  135. ^ "ECPGR Malus/Pyrus Working Group Members". Ecpgr.cgiar.org. 22 July 2002. Archived from the original on 26 August 2014. Retrieved 25 August 2014. +
  136. +
  137. ^ Jump up to: a b Tarjan, Sue (Fall 2006). "Autumn Apple Musings" (PDF). News & Notes of the UCSC Farm & Garden, Center for Agroecology & Sustainable Food Systems. pp. 1–2. Archived from the original (PDF) on 11 August 2007. Retrieved 24 January 2008. +
  138. +
  139. ^ Beck, Kellen (17 October 2020). "How breeders bring out the best in new apples". Mashable. Archived from the original on 31 July 2024. Retrieved 31 July 2024. +
  140. +
  141. ^ Migicovsky, Zoë (22 August 2021). "How a few good apples spawned today's top varieties — and why breeders must branch out". The Conversation. Archived from the original on 31 July 2024. Retrieved 31 July 2024. +
  142. +
  143. ^ Peil, A.; Dunemann, F.; Richter, K.; Hoefer, M.; Király, I.; Flachowsky, H.; Hanke, M.-V. (2008). "Resistance Breeding in Apple at Dresden-Pillnitz". Ecofruit - 13th International Conference on Cultivation Technique and Phytopathological Problems in Organic Fruit-Growing: Proceedings to the Conference from 18th February to 20th February 2008 at Weinsberg/Germany (in German): 220–225. Archived from the original on 28 January 2021. Retrieved 31 July 2024. +
  144. +
  145. ^ Jump up to: a b "World apple situation". Archived from the original on 11 February 2008. Retrieved 24 January 2008. +
  146. +
  147. ^ Weaver, Sue (June–July 2003). "Crops & Gardening – Apples of Antiquity". Hobby Farms Magazine. Archived from the original on 19 February 2017. +
  148. +
  149. ^ Jump up to: a b c "Apple production in 2022; from pick lists: Crops/World Regions/Production Quantity". FAOSTAT, UN Food and Agriculture Organization, Statistics Division. 2024. Archived from the original on 12 November 2016. Retrieved 18 June 2024. +
  150. +
  151. ^ Nelson, Lewis S.; Shih, Richard D.; Balick, Michael J. (2007). Handbook of Poisonous and Injurious Plants (Second ed.). New York: New York Botanical Garden : Springer. pp. 27, 211–212. ISBN 978-0387-31268-2. LCCN 2005938815. OCLC 77537459. Retrieved 11 September 2024. +
  152. +
  153. ^ "Amygdalin". Toxnet, US Library of Medicine. Archived from the original on 21 April 2017. Retrieved 20 April 2017. +
  154. +
  155. ^ Jump up to: a b c d e f "General Information – Apple". Informall. Archived from the original on 23 July 2012. Retrieved 17 October 2011. +
  156. +
  157. ^ Landau, Elizabeth, Oral allergy syndrome may explain mysterious reactions, 8 April 2009, CNN Health, accessed 17 October 2011 +
  158. +
  159. ^ United States Food and Drug Administration (2024). "Daily Value on the Nutrition and Supplement Facts Labels". FDA. Archived from the original on 27 March 2024. Retrieved 28 March 2024. +
  160. +
  161. ^ National Academies of Sciences, Engineering, and Medicine; Health and Medicine Division; Food and Nutrition Board; Committee to Review the Dietary Reference Intakes for Sodium and Potassium (2019). Oria, Maria; Harrison, Meghan; Stallings, Virginia A. (eds.). Dietary Reference Intakes for Sodium and Potassium. The National Academies Collection: Reports funded by National Institutes of Health. Washington, DC: National Academies Press (US). ISBN 978-0-309-48834-1. PMID 30844154. Archived from the original on 9 May 2024. Retrieved 21 June 2024. +
  162. +
  163. ^ Jump up to: a b c d Davidson, Alan (2014). "Apple". In Jaine, Tom (ed.). The Oxford Companion to Food. Illustrated by Soun Vannithone (Third ed.). Oxford: Oxford University Press. pp. 27–31. ISBN 978-0-19-967733-7. LCCN 2013957569. OCLC 890807357. OL 27172691M. Retrieved 18 September 2024. +
  164. +
  165. ^ Traverso, Amy (2011). The Apple Lover's Cookbook. Photographs by Squire Fox (First ed.). New York: W.W. Norton & Company. pp. 16, 32, 35, 45, 92, 137, 262–263, 275. ISBN 978-0-393-06599-2. LCCN 2011016560. OCLC 711051767. OL 16450839W. +
  166. +
  167. ^ Kellogg, Kristi (15 January 2015). "81 Best Apple Recipes: Dinners, Desserts, Salads, and More". Epicurious. Archived from the original on 18 October 2020. Retrieved 17 October 2020. +
  168. +
  169. ^ Davidson, Alan (2014). "Toffee Apple". In Jaine, Tom (ed.). The Oxford Companion to Food. Illustrated by Soun Vannithone (Third ed.). Oxford: Oxford University Press. p. 824. ISBN 978-0-19-967733-7. LCCN 2013957569. OCLC 890807357. OL 27172691M. Retrieved 18 September 2024. +
  170. +
  171. ^ Shurpin, Yehuda. "Why All the Symbolic Rosh Hashanah Foods? "בולבול"". Chabad.org. Archived from the original on 21 March 2023. Retrieved 21 March 2023. +
  172. +
  173. ^ Yepsen, Roger B. (2017) [1994]. Apples (Revised and Updated ed.). New York: W.W. Norton & Company. p. 52. ISBN 978-1-68268-019-3. LCCN 2017010136. OCLC 973918728. +
  174. +
  175. ^ "Organic apples". USDA Agricultural Marketing Service. February 2016. Archived from the original on 24 February 2017. Retrieved 23 February 2017. +
  176. +
  177. ^ Jump up to: a b "European Organic Apple Production Demonstrates the Value of Pesticides" (PDF). CropLife Foundation, Washington, DC. December 2011. Archived (PDF) from the original on 24 February 2017. Retrieved 23 February 2017. +
  178. +
  179. ^ Ribeiro, Flávia A.P.; Gomes de Moura, Carolina F.; Aguiar, Odair; de Oliveira, Flavia; Spadari, Regina C.; Oliveira, Nara R.C.; Oshima, Celina T.F.; Ribeiro, Daniel A. (September 2014). "The chemopreventive activity of apple against carcinogenesis: antioxidant activity and cell cycle control". European Journal of Cancer Prevention (Review). 23 (5): 477–480. doi:10.1097/CEJ.0000000000000005. PMID 24366437. S2CID 23026644. +
  180. +
  181. ^ Nicolas, J. J.; Richard-Forget, F. C.; Goupy, P. M.; Amiot, M. J.; Aubert, S. Y. (1 January 1994). "Enzymatic browning reactions in apple and apple products". Critical Reviews in Food Science and Nutrition. 34 (2): 109–157. doi:10.1080/10408399409527653. PMID 8011143. +
  182. +
  183. ^ "PPO silencing". Okanagan Specialty Fruits. 2019. Archived from the original on 27 April 2021. Retrieved 14 November 2019. +
  184. +
  185. ^ "United States: GM non-browning Arctic apple expands into foodservice". Fresh Fruit Portal. 13 August 2019. Archived from the original on 27 June 2021. Retrieved 14 November 2019. +
  186. +
  187. ^ "Okanagan Specialty Fruits: Biotechnology Consultation Agency Response Letter BNF 000132". U.S. Food and Drug Administration. 20 March 2015. Archived from the original on 31 October 2017. Retrieved 14 November 2019. +
  188. +
  189. ^ "Questions and answers: Arctic Apple". Canadian Food Inspection Agency, Government of Canada. 8 September 2017. Archived from the original on 19 September 2018. Retrieved 14 November 2019. +
  190. +
  191. ^ Yu, Xiuzhu; Van De Voort, Frederick R.; Li, Zhixi; Yue, Tianli (2007). "Proximate Composition of the Apple Seed and Characterization of Its Oil". International Journal of Food Engineering. 3 (5). doi:10.2202/1556-3758.1283. S2CID 98590230. +
  192. +
  193. ^ Jump up to: a b c Davidson, Hilda Roderick Ellis (1990) [1st pub. 1964]. Gods and Myths of Northern Europe. London: Penguin Books. pp. 165–166. ISBN 0-14-013627-4. OCLC 29336401. +
  194. +
  195. ^ Davidson, Hilda Ellis (1998). Roles of the Northern Goddess. London; New York: Routledge. pp. 146–147. doi:10.4324/9780203025550. ISBN 0-415-13610-5. LCCN 97018309. OCLC 48138055. +
  196. +
  197. ^ Biedermann, Hans (1992). Dictionary of Symbolism. Translated by Hulbert, James. New York: Facts on File. pp. 16–17. ISBN 978-0-8160-2593-0. LCCN 91044933. OCLC 25092926. Retrieved 3 October 2024. +
  198. +
  199. ^ Jump up to: a b Ruck, Carl A. P.; Staples, Blaise D.; Heinrich, Clark (2001). The apples of Apollo : pagan and Christian mysteries of the Eucharist. Durham, North Carolina: Carolina Academic Press. pp. 64–70. ISBN 978-0-89089-924-3. LCCN 00040351. OCLC 46337324. +
  200. +
  201. ^ "Eris - Greek Goddess of Strife & Discord (Roman Discordia)". Theoi Project. Aaron J. Atsma. Archived from the original on 25 September 2024. Retrieved 26 September 2024. +
  202. +
  203. ^ Lucian (1905). The Works of Lucian of Samosata. Vol. I. Translated by Fowler, H.W.; Fowler, F.G. (First ed.). Oxford: Clarendon Press. pp. 78–85. LCCN 06001045. OCLC 506365. Retrieved 26 September 2024. +
  204. +
  205. ^ "Judgement of Paris - Greek Mythology". Theoi Project. Aaron J. Atsma. Archived from the original on 24 August 2024. Retrieved 26 September 2024. +
  206. +
  207. ^ Plato (1997). "Epigrams". In Cooper, John M.; Hutchinson, D.S. (eds.). Complete Works. Translated by Edmonds, J.M.; Cooper, John M. Indianapolis, Indiana: Hackett Publishing. p. 1744. ISBN 0-87220-349-2. LCCN 96053280. OCLC 36178550. Retrieved 27 September 2024. +
  208. +
  209. ^ Pinsent, John (1969). Greek Mythology (First ed.). London: Paul Hamlyn. p. 79. ISBN 978-0-600-02422-4. LCCN 78449216. OCLC 61702. Retrieved 3 October 2024. +
  210. +
  211. ^ "Atalanta (Atalante) - Arcadian Heroine of Greek Mythology". Theoi Project. Aaron J. Atsma. Archived from the original on 27 September 2024. Retrieved 3 October 2024. +
  212. +
  213. ^ Flieger, Verlyn (2005). Interrupted Music : The Making of Tolkien's Mythology. Kent, Ohio: Kent State University Press. pp. 122–123. ISBN 978-0-87338-824-5. LCCN 2004024490. OCLC 56805947. +
  214. +
  215. ^ Jump up to: a b "Why Do the Chinese Give Apples Around Christmas?". Teach English In China. 22 December 2019. Archived from the original on 1 October 2020. Retrieved 3 September 2024. +
  216. +
  217. ^ Jump up to: a b Macrone, Michael (1998). Brush up your Bible!. New York: Gramercy Books. pp. 15–16, 340–341. ISBN 978-0-517-20189-3. OCLC 38270894. Retrieved 31 July 2024. +
  218. +
  219. ^ Kissling, Paul J. (2004). Genesis. Vol. 1. Joplin, Missouri: College Press. p. 193. ISBN 978-0-89900-875-2. LCCN 2004022577. OCLC 56672257. Archived from the original on 26 January 2021. Retrieved 6 October 2020. +
  220. +
  221. ^ Genesis 2:17 +
  222. +
  223. ^ Hendel, Ronald S. (2013). The Book of Genesis: A Biography. Princeton, New Jersey: Princeton University Press. p. 114. ISBN 978-0-69114012-4. LCCN 2012015634. OCLC 788265521. Archived from the original on 5 March 2023. Retrieved 4 October 2024. +
  224. +
  225. ^ Mieder, Wolfgang; Kingsbury, Stewart A.; Harder, Kelsie B., eds. (1996) [1992]. A Dictionary of American Proverbs (Paperback ed.). New York: Oxford University Press. p. 23. ISBN 978-0-19-511133-0. LCCN 91015508. OCLC 23693799. Retrieved 23 August 2024. +
  226. +
  227. ^ Pollan, Michael (2001). The Botany of Desire: A Plant's-Eye View of the World (First ed.). New York: Random House. pp. 9, 22, 50. ISBN 978-0-375-50129-6. LCCN 00066479. OCLC 49803415. +
  228. +
+

Further reading

+ +
+
  • Media related to Apples at Wikimedia Commons
+ + + + + + + + + + + +
+
+ +
+
+ +
+ +
+
+
+
    +
    + + +
    # Patch metadata carried over from the enclosing diff:
    # \ No newline at end of file
    # diff --git a/tests/async/test_content_scraper_strategy.py b/tests/async/test_content_scraper_strategy.py
    # new file mode 100644
    # index 00000000..5dfa6362
    """Compare two WebScrapingStrategy implementations on a saved Wikipedia page.

    Every test case feeds the same HTML and the same keyword arguments to both
    scrapers. The per-case counts (images, links, markdown length) and timings
    are written to a CSV file next to this script and printed as a grid table.
    """
    import asyncio
    import csv
    import os
    import sys
    import time
    from dataclasses import dataclass
    from typing import Any, Dict, List, Tuple

    from bs4 import BeautifulSoup
    from tabulate import tabulate

    # Three directory levels up from tests/async/<this file> is the repository
    # root; adding it to sys.path lets `crawl4ai` be imported from a checkout.
    parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    sys.path.append(parent_dir)
    __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

    from crawl4ai.content_scrapping_strategy import WebScrapingStrategy
    # NOTE: "current" is an alias of the very same class here, so both sides of
    # the comparison run one implementation. To compare against another module,
    # swap the import below for something like:
    # from crawl4ai.content_scrapping_strategy_current import WebScrapingStrategy as WebScrapingStrategyCurrent
    from crawl4ai.content_scrapping_strategy import WebScrapingStrategy as WebScrapingStrategyCurrent


    @dataclass
    class TestResult:
        """Metrics gathered from one scraper run of one test case."""

        # The class name starts with "Test", so pytest would try to collect it
        # (and warn, because dataclasses define __init__). Opt out explicitly.
        # Un-annotated, so this is a plain class attribute, not a dataclass field.
        __test__ = False

        name: str
        success: bool
        images: int
        internal_links: int
        external_links: int
        markdown_length: int
        execution_time: float  # seconds


    class StrategyTester:
        """Run identical scraping test cases against the new and current strategies."""

        def __init__(self):
            """Instantiate both scrapers and load the sample HTML from disk.

            Raises:
                OSError: if ``sample_wikipedia.html`` is missing next to this script.
            """
            self.new_scraper = WebScrapingStrategy()
            self.current_scraper = WebScrapingStrategyCurrent()
            sample_path = os.path.join(__location__, 'sample_wikipedia.html')
            with open(sample_path, 'r', encoding='utf-8') as f:
                self.WIKI_HTML = f.read()
            self.results = {'new': [], 'current': []}

        def run_test(self, name: str, **kwargs) -> Tuple[TestResult, TestResult]:
            """Run one test case on both scrapers.

            Args:
                name: Label stored in each returned ``TestResult``.
                **kwargs: Forwarded unchanged to the scraper's
                    ``_get_content_of_website_optimized``.

            Returns:
                ``(new_result, current_result)``.
            """
            results = []
            for scraper in (self.new_scraper, self.current_scraper):
                # perf_counter is monotonic and high-resolution, unlike
                # time.time(), which can jump if the system clock is adjusted.
                start_time = time.perf_counter()
                result = scraper._get_content_of_website_optimized(
                    url="https://en.wikipedia.org/wiki/Test",
                    html=self.WIKI_HTML,
                    **kwargs
                )
                execution_time = time.perf_counter() - start_time

                results.append(TestResult(
                    name=name,
                    success=result['success'],
                    images=len(result['media']['images']),
                    internal_links=len(result['links']['internal']),
                    external_links=len(result['links']['external']),
                    markdown_length=len(result['markdown']),
                    execution_time=execution_time
                ))

            new_result, current_result = results
            return new_result, current_result

        def run_all_tests(self):
            """Run every predefined test case, then save and print the comparison."""
            test_cases = [
                ("Basic Extraction", {}),
                ("Exclude Tags", {'excluded_tags': ['table', 'div.infobox', 'div.navbox']}),
                ("Word Threshold", {'word_count_threshold': 50}),
                ("CSS Selector", {'css_selector': 'div.mw-parser-output > p'}),
                ("Link Exclusions", {
                    'exclude_external_links': True,
                    'exclude_social_media_links': True,
                    'exclude_domains': ['facebook.com', 'twitter.com']
                }),
                ("Media Handling", {
                    'exclude_external_images': True,
                    'image_description_min_word_threshold': 20
                }),
                ("Text Only", {
                    'only_text': True,
                    'remove_forms': True
                }),
                ("HTML Cleaning", {
                    'clean_html': True,
                    'keep_data_attributes': True
                }),
                ("HTML2Text Options", {
                    'html2text': {
                        'skip_internal_links': True,
                        'single_line_break': True,
                        'mark_code': True,
                        'preserve_tags': ['pre', 'code']
                    }
                })
            ]

            all_results = []
            for name, kwargs in test_cases:
                try:
                    new_result, current_result = self.run_test(name, **kwargs)
                except Exception as e:
                    # Deliberate best-effort: one failing case is reported and
                    # skipped so the remaining cases still get compared.
                    print(f"Error in {name}: {str(e)}")
                else:
                    all_results.append((name, new_result, current_result))

            self.save_results_to_csv(all_results)
            self.print_comparison_table(all_results)

        @staticmethod
        def _result_row(name: str, strategy: str, result: TestResult) -> list:
            """Build one output row (shared by the CSV and the printed table)."""
            return [
                name, strategy, result.success, result.images,
                result.internal_links, result.external_links,
                result.markdown_length, f"{result.execution_time:.3f}"
            ]

        def save_results_to_csv(self, all_results: List[Tuple[str, TestResult, TestResult]]):
            """Write two rows per test case (New, Current) to
            ``strategy_comparison_results.csv`` next to this script."""
            csv_file = os.path.join(__location__, 'strategy_comparison_results.csv')
            # newline='' is what the csv module requires for correct line endings.
            with open(csv_file, 'w', newline='', encoding='utf-8') as f:
                writer = csv.writer(f)
                writer.writerow(['Test Name', 'Strategy', 'Success', 'Images', 'Internal Links',
                                 'External Links', 'Markdown Length', 'Execution Time'])

                for name, new_result, current_result in all_results:
                    writer.writerow(self._result_row(name, 'New', new_result))
                    writer.writerow(self._result_row(name, 'Current', current_result))

        def print_comparison_table(self, all_results: List[Tuple[str, TestResult, TestResult]]):
            """Print a grid table with both strategies per case and flag any
            metric on which the two results disagree."""
            table_data = []
            headers = ['Test Name', 'Strategy', 'Success', 'Images', 'Internal Links',
                       'External Links', 'Markdown Length', 'Time (s)']

            for name, new_result, current_result in all_results:
                # Collect the metrics whose values differ between strategies.
                differences = []
                if new_result.images != current_result.images: differences.append('images')
                if new_result.internal_links != current_result.internal_links: differences.append('internal_links')
                if new_result.external_links != current_result.external_links: differences.append('external_links')
                if new_result.markdown_length != current_result.markdown_length: differences.append('markdown')

                table_data.append(self._result_row(name, 'New', new_result))
                # The test name is shown only on the first row of each pair.
                table_data.append(self._result_row('', 'Current', current_result))

                # Add difference summary if any
                if differences:
                    table_data.append(['', '⚠️ Differences', ', '.join(differences), '', '', '', '', ''])

                # Add empty row for better readability
                table_data.append([''] * len(headers))

            print("\nStrategy Comparison Results:")
            print(tabulate(table_data, headers=headers, tablefmt='grid'))


    if __name__ == "__main__":
        tester = StrategyTester()
        tester.run_all_tests()

    # Patch metadata carried over from the enclosing diff (start of next patch):
    # From 17913f5acf28cfac775085b74496d1ed5aafcae6 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Wed,
13 Nov 2024 20:00:29 +0800 Subject: [PATCH 018/115] feat(crawler): support local files and raw HTML input in AsyncWebCrawler --- crawl4ai/async_webcrawler.py | 49 +++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 20 deletions(-) diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 9d0340dc..8415f9b9 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -104,6 +104,10 @@ class AsyncWebCrawler: extracted_content = None is_web_url = url.startswith(('http://', 'https://')) + is_local_file = url.startswith("file://") + is_raw_html = url.startswith("raw:") + _url = url if not is_raw_html else "Raw HTML" + if is_web_url and not bypass_cache and not self.always_by_pass_cache: cached = await async_db_manager.aget_cached_url(url) @@ -131,7 +135,7 @@ class AsyncWebCrawler: t2 = time.time() if verbose: print( - f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1:.2f} seconds" + f"[LOG] 🚀 Crawling done for {_url}, success: {bool(html)}, time taken: {t2 - t1:.2f} seconds" ) crawl_result = await self.aprocess_html( @@ -147,6 +151,9 @@ class AsyncWebCrawler: is_cached=bool(cached), async_response=async_response, bypass_cache=bypass_cache, + is_web_url = is_web_url, + is_local_file = is_local_file, + is_raw_html = is_raw_html, **kwargs, ) @@ -164,8 +171,8 @@ class AsyncWebCrawler: except Exception as e: if not hasattr(e, "msg"): e.msg = str(e) - print(f"[ERROR] 🚫 arun(): Failed to crawl {url}, error: {e.msg}") - return CrawlResult(url=url, html="", markdown = f"[ERROR] 🚫 arun(): Failed to crawl {url}, error: {e.msg}", success=False, error_message=e.msg) + print(f"[ERROR] 🚫 arun(): Failed to crawl {_url}, error: {e.msg}") + return CrawlResult(url=url, html="", markdown = f"[ERROR] 🚫 arun(): Failed to crawl {_url}, error: {e.msg}", success=False, error_message=e.msg) async def arun_many( self, @@ -233,6 +240,7 @@ class AsyncWebCrawler: t = time.time() # Extract content from HTML try: + 
_url = url if not kwargs.get("is_raw_html", False) else "Raw HTML" t1 = time.time() scrapping_strategy = WebScrapingStrategy() # result = await scrapping_strategy.ascrap( @@ -249,7 +257,7 @@ class AsyncWebCrawler: ) if verbose: print( - f"[LOG] 🚀 Content extracted for {url}, success: True, time taken: {time.time() - t1:.2f} seconds" + f"[LOG] 🚀 Content extracted for {_url}, success: True, time taken: {time.time() - t1:.2f} seconds" ) if result is None: @@ -270,7 +278,7 @@ class AsyncWebCrawler: if extracted_content is None and extraction_strategy and chunking_strategy: if verbose: print( - f"[LOG] 🔥 Extracting semantic blocks for {url}, Strategy: {self.__class__.__name__}" + f"[LOG] 🔥 Extracting semantic blocks for {_url}, Strategy: {self.__class__.__name__}" ) # Check if extraction strategy is type of JsonCssExtractionStrategy @@ -285,7 +293,7 @@ class AsyncWebCrawler: if verbose: print( - f"[LOG] 🚀 Extraction done for {url}, time taken: {time.time() - t:.2f} seconds." + f"[LOG] 🚀 Extraction done for {_url}, time taken: {time.time() - t:.2f} seconds." 
) screenshot = None if not screenshot else screenshot @@ -296,20 +304,21 @@ class AsyncWebCrawler: response_headers = json.dumps(async_response.response_headers, ensure_ascii=False) - if not is_cached or kwargs.get("bypass_cache", False) or self.always_by_pass_cache: - await async_db_manager.acache_url( - url, - html, - cleaned_html, - markdown, - extracted_content, - True, - json.dumps(media), - json.dumps(links), - json.dumps(metadata), - screenshot=screenshot, - response_headers=response_headers, - ) + if not kwargs.get("is_raw_html", False): + if not is_cached or kwargs.get("bypass_cache", False) or self.always_by_pass_cache: + await async_db_manager.acache_url( + url, + html, + cleaned_html, + markdown, + extracted_content, + True, + json.dumps(media), + json.dumps(links), + json.dumps(metadata), + screenshot=screenshot, + response_headers=response_headers, + ) return CrawlResult( url=url, From 3d00fee6c28e16556c7a51035586faad7f5e1639 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 14 Nov 2024 22:50:59 +0800 Subject: [PATCH 019/115] - In this commit, the library is updated to process file downloads. Users can now specify a download folder and trigger the download process via JavaScript or other means, with all files being saved. The list of downloaded files will also be added to the crowd result object. - Another thing this commit introduces is the concept of the Relevance Content Filter. This is an improvement over Fit Markdown. This class of strategies aims to extract the main content from a given page - the part that really matters and is useful to be processed. One strategy has been created using the BM25 algorithm, which finds chunks of text from the web page relevant to its title, descriptions, and keywords, or supports a given user query and matches them. The result is then returned to the main engine to be converted to Markdown. Plans include adding approaches using language models as well. 
- The cache database was updated to hold information about response headers and downloaded files. --- crawl4ai/async_crawler_strategy.py | 63 +- crawl4ai/async_crawler_strategy_0.3.73.py | 965 ---------------------- crawl4ai/async_database.py | 22 +- crawl4ai/async_webcrawler.py | 47 +- crawl4ai/content_cleaning_strategy.py | 198 ----- crawl4ai/content_filter_strategy.py | 344 ++++++++ crawl4ai/content_scrapping_strategy.py | 14 +- crawl4ai/models.py | 18 +- crawl4ai/utils.py | 55 ++ tests/async/test_async_doanloader.py | 229 +++++ 10 files changed, 739 insertions(+), 1216 deletions(-) delete mode 100644 crawl4ai/async_crawler_strategy_0.3.73.py delete mode 100644 crawl4ai/content_cleaning_strategy.py create mode 100644 crawl4ai/content_filter_strategy.py create mode 100644 tests/async/test_async_doanloader.py diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index baa06e47..83933a35 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -14,6 +14,7 @@ from pydantic import BaseModel import hashlib import json import uuid +from .models import AsyncCrawlResponse from playwright_stealth import StealthConfig, stealth_async @@ -148,15 +149,6 @@ class ManagedBrowser: except Exception as e: print(f"Error removing temporary directory: {e}") -class AsyncCrawlResponse(BaseModel): - html: str - response_headers: Dict[str, str] - status_code: int - screenshot: Optional[str] = None - get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None - - class Config: - arbitrary_types_allowed = True class AsyncCrawlerStrategy(ABC): @abstractmethod @@ -215,6 +207,13 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): 'before_retrieve_html': None } self.extra_args = kwargs.get("extra_args", []) + self.accept_downloads = kwargs.get("accept_downloads", False) + self.downloads_path = kwargs.get("downloads_path") + self._downloaded_files = [] # Track downloaded files for current crawl + 
if self.accept_downloads and not self.downloads_path: + self.downloads_path = os.path.join(os.getcwd(), "downloads") + os.makedirs(self.downloads_path, exist_ok=True) + async def __aenter__(self): await self.start() @@ -250,7 +249,12 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # Set up the default context if self.default_context: await self.default_context.set_extra_http_headers(self.headers) - + if self.accept_downloads: + await self.default_context.set_default_timeout(60000) + await self.default_context.set_default_navigation_timeout(60000) + self.default_context._impl_obj._options["accept_downloads"] = True + self.default_context._impl_obj._options["downloads_path"] = self.downloads_path + if self.user_agent: await self.default_context.set_extra_http_headers({ "User-Agent": self.user_agent @@ -301,12 +305,14 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if self.use_persistent_context and self.user_data_dir: self.browser = await self.playwright.chromium.launch_persistent_context( user_data_dir=self.user_data_dir, + accept_downloads=self.accept_downloads, + downloads_path=self.downloads_path if self.accept_downloads else None, **browser_args ) self.default_context = self.browser else: self.browser = await self.playwright.chromium.launch(**browser_args) - + except Exception as e: # Fallback to chromium if Chrome channel fails if "chrome" in str(e) and browser_args.get("channel") == "chrome": @@ -565,6 +571,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): response_headers = {} status_code = None + # Reset downloaded files list for new crawl + self._downloaded_files = [] + self._cleanup_expired_sessions() session_id = kwargs.get("session_id") @@ -592,10 +601,11 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # Normal context creation for non-persistent or non-Chrome browsers context = await self.browser.new_context( user_agent=self.user_agent, - viewport={"width": 1920, "height": 1080}, + 
viewport={"width": 1200, "height": 800}, proxy={"server": self.proxy} if self.proxy else None, - accept_downloads=True, - java_script_enabled=True + java_script_enabled=True, + accept_downloads=self.accept_downloads, + downloads_path=self.downloads_path if self.accept_downloads else None ) await context.add_cookies([{"name": "cookiesEnabled", "value": "true", "url": url}]) await context.set_extra_http_headers(self.headers) @@ -655,6 +665,10 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): page.on("pageerror", lambda exc: print(f"Page Error: {exc}")) try: + # Set up download handling if enabled + if self.accept_downloads: + page.on("download", lambda download: asyncio.create_task(self._handle_download(download))) + if self.verbose: print(f"[LOG] 🕸️ Crawling {url} using AsyncPlaywrightCrawlerStrategy...") @@ -886,7 +900,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): response_headers=response_headers, status_code=status_code, screenshot=screenshot_data, - get_delayed_content=get_delayed_content + get_delayed_content=get_delayed_content, + downloaded_files=self._downloaded_files if self._downloaded_files else None ) return response except Error as e: @@ -896,6 +911,24 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # await page.close() # await context.close() + async def _handle_download(self, download): + """Handle file downloads.""" + try: + suggested_filename = download.suggested_filename + download_path = os.path.join(self.downloads_path, suggested_filename) + + if self.verbose: + print(f"[LOG] 📥 Downloading {suggested_filename} to {download_path}") + + await download.save_as(download_path) + self._downloaded_files.append(download_path) + + if self.verbose: + print(f"[LOG] ✅ Downloaded {suggested_filename} successfully") + except Exception as e: + if self.verbose: + print(f"[ERROR] Failed to handle download: {str(e)}") + async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]: semaphore_count 
= kwargs.get('semaphore_count', 5) # Adjust as needed semaphore = asyncio.Semaphore(semaphore_count) diff --git a/crawl4ai/async_crawler_strategy_0.3.73.py b/crawl4ai/async_crawler_strategy_0.3.73.py deleted file mode 100644 index 54835dad..00000000 --- a/crawl4ai/async_crawler_strategy_0.3.73.py +++ /dev/null @@ -1,965 +0,0 @@ -import asyncio -import base64 -import time -from abc import ABC, abstractmethod -from typing import Callable, Dict, Any, List, Optional, Awaitable -import os, sys, shutil -import tempfile, subprocess -from playwright.async_api import async_playwright, Page, Browser, Error -from io import BytesIO -from PIL import Image, ImageDraw, ImageFont -from pathlib import Path -from playwright.async_api import ProxySettings -from pydantic import BaseModel -import hashlib -import json -import uuid - -from playwright_stealth import StealthConfig, stealth_async - -stealth_config = StealthConfig( - webdriver=True, - chrome_app=True, - chrome_csi=True, - chrome_load_times=True, - chrome_runtime=True, - navigator_languages=True, - navigator_plugins=True, - navigator_permissions=True, - webgl_vendor=True, - outerdimensions=True, - navigator_hardware_concurrency=True, - media_codecs=True, -) - - -class ManagedBrowser: - def __init__(self, browser_type: str = "chromium", user_data_dir: Optional[str] = None, headless: bool = False): - self.browser_type = browser_type - self.user_data_dir = user_data_dir - self.headless = headless - self.browser_process = None - self.temp_dir = None - self.debugging_port = 9222 - - async def start(self) -> str: - """ - Starts the browser process and returns the CDP endpoint URL. - If user_data_dir is not provided, creates a temporary directory. 
- """ - - # Create temp dir if needed - if not self.user_data_dir: - self.temp_dir = tempfile.mkdtemp(prefix="browser-profile-") - self.user_data_dir = self.temp_dir - - # Get browser path and args based on OS and browser type - browser_path = self._get_browser_path() - args = self._get_browser_args() - - # Start browser process - try: - self.browser_process = subprocess.Popen( - args, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE - ) - await asyncio.sleep(2) # Give browser time to start - return f"http://localhost:{self.debugging_port}" - except Exception as e: - await self.cleanup() - raise Exception(f"Failed to start browser: {e}") - - def _get_browser_path(self) -> str: - """Returns the browser executable path based on OS and browser type""" - if sys.platform == "darwin": # macOS - paths = { - "chromium": "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome", - "firefox": "/Applications/Firefox.app/Contents/MacOS/firefox", - "webkit": "/Applications/Safari.app/Contents/MacOS/Safari" - } - elif sys.platform == "win32": # Windows - paths = { - "chromium": "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe", - "firefox": "C:\\Program Files\\Mozilla Firefox\\firefox.exe", - "webkit": None # WebKit not supported on Windows - } - else: # Linux - paths = { - "chromium": "google-chrome", - "firefox": "firefox", - "webkit": None # WebKit not supported on Linux - } - - return paths.get(self.browser_type) - - def _get_browser_args(self) -> List[str]: - """Returns browser-specific command line arguments""" - base_args = [self._get_browser_path()] - - if self.browser_type == "chromium": - args = [ - f"--remote-debugging-port={self.debugging_port}", - f"--user-data-dir={self.user_data_dir}", - ] - if self.headless: - args.append("--headless=new") - elif self.browser_type == "firefox": - args = [ - "--remote-debugging-port", str(self.debugging_port), - "--profile", self.user_data_dir, - ] - if self.headless: - args.append("--headless") - else: - raise 
NotImplementedError(f"Browser type {self.browser_type} not supported") - - return base_args + args - - async def cleanup(self): - """Cleanup browser process and temporary directory""" - if self.browser_process: - try: - self.browser_process.terminate() - await asyncio.sleep(1) - if self.browser_process.poll() is None: - self.browser_process.kill() - except Exception as e: - print(f"Error terminating browser: {e}") - - if self.temp_dir and os.path.exists(self.temp_dir): - try: - shutil.rmtree(self.temp_dir) - except Exception as e: - print(f"Error removing temporary directory: {e}") - -class AsyncCrawlResponse(BaseModel): - html: str - response_headers: Dict[str, str] - status_code: int - screenshot: Optional[str] = None - get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None - - class Config: - arbitrary_types_allowed = True - -class AsyncCrawlerStrategy(ABC): - @abstractmethod - async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse: - pass - - @abstractmethod - async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]: - pass - - @abstractmethod - async def take_screenshot(self, **kwargs) -> str: - pass - - @abstractmethod - def update_user_agent(self, user_agent: str): - pass - - @abstractmethod - def set_hook(self, hook_type: str, hook: Callable): - pass - -class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): - def __init__(self, use_cached_html=False, js_code=None, **kwargs): - self.use_cached_html = use_cached_html - self.user_agent = kwargs.get( - "user_agent", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " - "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" - ) - self.proxy = kwargs.get("proxy") - self.proxy_config = kwargs.get("proxy_config") - self.headless = kwargs.get("headless", True) - self.browser_type = kwargs.get("browser_type", "chromium") - self.headers = kwargs.get("headers", {}) - self.sessions = {} - self.session_ttl = 1800 - self.js_code = js_code - 
self.verbose = kwargs.get("verbose", False) - self.playwright = None - self.browser = None - self.sleep_on_close = kwargs.get("sleep_on_close", False) - self.use_managed_browser = kwargs.get("use_managed_browser", False) - self.user_data_dir = kwargs.get("user_data_dir", None) - self.use_persistent_context = kwargs.get("use_persistent_context", False) - self.chrome_channel = kwargs.get("chrome_channel", "chrome") - self.managed_browser = None - self.default_context = None - self.hooks = { - 'on_browser_created': None, - 'on_user_agent_updated': None, - 'on_execution_started': None, - 'before_goto': None, - 'after_goto': None, - 'before_return_html': None, - 'before_retrieve_html': None - } - self.extra_args = kwargs.get("extra_args", []) - - async def __aenter__(self): - await self.start() - return self - - async def __aexit__(self, exc_type, exc_val, exc_tb): - await self.close() - - async def start(self): - if self.playwright is None: - self.playwright = await async_playwright().start() - if self.browser is None: - if self.use_managed_browser: - # Use managed browser approach - self.managed_browser = ManagedBrowser( - browser_type=self.browser_type, - user_data_dir=self.user_data_dir, - headless=self.headless - ) - cdp_url = await self.managed_browser.start() - self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url) - - # Get the default context that maintains the user profile - contexts = self.browser.contexts - if contexts: - self.default_context = contexts[0] - else: - # If no default context exists, create one - self.default_context = await self.browser.new_context( - viewport={"width": 1920, "height": 1080} - ) - - # Set up the default context - if self.default_context: - await self.default_context.set_extra_http_headers(self.headers) - - if self.user_agent: - await self.default_context.set_extra_http_headers({ - "User-Agent": self.user_agent - }) - else: - browser_args = { - "headless": self.headless, - "args": [ - "--disable-gpu", - 
"--no-sandbox", - "--disable-dev-shm-usage", - "--disable-blink-features=AutomationControlled", - "--disable-infobars", - "--window-position=0,0", - "--ignore-certificate-errors", - "--ignore-certificate-errors-spki-list", - # "--disable-http2", - # "--headless=new", # Use the new headless mode - ] - } - - # Add extra args if provided - if self.extra_args: - browser_args["args"].extend(self.extra_args) - - # Add proxy settings if a proxy is specified - if self.proxy: - proxy_settings = ProxySettings(server=self.proxy) - browser_args["proxy"] = proxy_settings - elif self.proxy_config: - proxy_settings = ProxySettings(server=self.proxy_config.get("server"), username=self.proxy_config.get("username"), password=self.proxy_config.get("password")) - browser_args["proxy"] = proxy_settings - - # Select the appropriate browser based on the browser_type - if self.browser_type == "firefox": - self.browser = await self.playwright.firefox.launch(**browser_args) - elif self.browser_type == "webkit": - self.browser = await self.playwright.webkit.launch(**browser_args) - else: - self.browser = await self.playwright.chromium.launch(**browser_args) - - # Update the headless configuration - if self.headless: - # Use the new headless mode explicitly - browser_args["args"].append("--headless=new") - - await self.execute_hook('on_browser_created', self.browser) - - async def close(self): - if self.sleep_on_close: - await asyncio.sleep(0.5) - - # Close all active sessions - session_ids = list(self.sessions.keys()) - for session_id in session_ids: - await self.kill_session(session_id) - - if self.browser: - await self.browser.close() - self.browser = None - - if self.managed_browser: - await self.managed_browser.cleanup() - self.managed_browser = None - - if self.playwright: - await self.playwright.stop() - self.playwright = None - - def __del__(self): - if self.browser or self.playwright: - asyncio.get_event_loop().run_until_complete(self.close()) - - def set_hook(self, hook_type: str, 
hook: Callable): - if hook_type in self.hooks: - self.hooks[hook_type] = hook - else: - raise ValueError(f"Invalid hook type: {hook_type}") - - async def execute_hook(self, hook_type: str, *args): - hook = self.hooks.get(hook_type) - if hook: - if asyncio.iscoroutinefunction(hook): - return await hook(*args) - else: - return hook(*args) - return args[0] if args else None - - def update_user_agent(self, user_agent: str): - self.user_agent = user_agent - - def set_custom_headers(self, headers: Dict[str, str]): - self.headers = headers - - async def kill_session(self, session_id: str): - if session_id in self.sessions: - context, page, _ = self.sessions[session_id] - await page.close() - if not self.use_managed_browser: - await context.close() - del self.sessions[session_id] - - def _cleanup_expired_sessions(self): - current_time = time.time() - expired_sessions = [ - sid for sid, (_, _, last_used) in self.sessions.items() - if current_time - last_used > self.session_ttl - ] - for sid in expired_sessions: - asyncio.create_task(self.kill_session(sid)) - - async def smart_wait(self, page: Page, wait_for: str, timeout: float = 30000): - wait_for = wait_for.strip() - - if wait_for.startswith('js:'): - # Explicitly specified JavaScript - js_code = wait_for[3:].strip() - return await self.csp_compliant_wait(page, js_code, timeout) - elif wait_for.startswith('css:'): - # Explicitly specified CSS selector - css_selector = wait_for[4:].strip() - try: - await page.wait_for_selector(css_selector, timeout=timeout) - except Error as e: - if 'Timeout' in str(e): - raise TimeoutError(f"Timeout after {timeout}ms waiting for selector '{css_selector}'") - else: - raise ValueError(f"Invalid CSS selector: '{css_selector}'") - else: - # Auto-detect based on content - if wait_for.startswith('()') or wait_for.startswith('function'): - # It's likely a JavaScript function - return await self.csp_compliant_wait(page, wait_for, timeout) - else: - # Assume it's a CSS selector first - try: - 
await page.wait_for_selector(wait_for, timeout=timeout) - except Error as e: - if 'Timeout' in str(e): - raise TimeoutError(f"Timeout after {timeout}ms waiting for selector '{wait_for}'") - else: - # If it's not a timeout error, it might be an invalid selector - # Let's try to evaluate it as a JavaScript function as a fallback - try: - return await self.csp_compliant_wait(page, f"() => {{{wait_for}}}", timeout) - except Error: - raise ValueError(f"Invalid wait_for parameter: '{wait_for}'. " - "It should be either a valid CSS selector, a JavaScript function, " - "or explicitly prefixed with 'js:' or 'css:'.") - - async def csp_compliant_wait(self, page: Page, user_wait_function: str, timeout: float = 30000): - wrapper_js = f""" - async () => {{ - const userFunction = {user_wait_function}; - const startTime = Date.now(); - while (true) {{ - if (await userFunction()) {{ - return true; - }} - if (Date.now() - startTime > {timeout}) {{ - throw new Error('Timeout waiting for condition'); - }} - await new Promise(resolve => setTimeout(resolve, 100)); - }} - }} - """ - - try: - await page.evaluate(wrapper_js) - except TimeoutError: - raise TimeoutError(f"Timeout after {timeout}ms waiting for condition") - except Exception as e: - raise RuntimeError(f"Error in wait condition: {str(e)}") - - async def process_iframes(self, page): - # Find all iframes - iframes = await page.query_selector_all('iframe') - - for i, iframe in enumerate(iframes): - try: - # Add a unique identifier to the iframe - await iframe.evaluate(f'(element) => element.id = "iframe-{i}"') - - # Get the frame associated with this iframe - frame = await iframe.content_frame() - - if frame: - # Wait for the frame to load - await frame.wait_for_load_state('load', timeout=30000) # 30 seconds timeout - - # Extract the content of the iframe's body - iframe_content = await frame.evaluate('() => document.body.innerHTML') - - # Generate a unique class name for this iframe - class_name = f'extracted-iframe-content-{i}' 
- - # Replace the iframe with a div containing the extracted content - _iframe = iframe_content.replace('`', '\\`') - await page.evaluate(f""" - () => {{ - const iframe = document.getElementById('iframe-{i}'); - const div = document.createElement('div'); - div.innerHTML = `{_iframe}`; - div.className = '{class_name}'; - iframe.replaceWith(div); - }} - """) - else: - print(f"Warning: Could not access content frame for iframe {i}") - except Exception as e: - print(f"Error processing iframe {i}: {str(e)}") - - # Return the page object - return page - - async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse: - response_headers = {} - status_code = None - - self._cleanup_expired_sessions() - session_id = kwargs.get("session_id") - - # Handle page creation differently for managed browser - if self.use_managed_browser: - if session_id: - # Reuse existing session if available - context, page, _ = self.sessions.get(session_id, (None, None, None)) - if not page: - # Create new page in default context if session doesn't exist - page = await self.default_context.new_page() - self.sessions[session_id] = (self.default_context, page, time.time()) - else: - # Create new page in default context for non-session requests - page = await self.default_context.new_page() - else: - if session_id: - context, page, _ = self.sessions.get(session_id, (None, None, None)) - if not context: - context = await self.browser.new_context( - user_agent=self.user_agent, - viewport={"width": 1920, "height": 1080}, - proxy={"server": self.proxy} if self.proxy else None, - accept_downloads=True, - java_script_enabled=True - ) - await context.add_cookies([{"name": "cookiesEnabled", "value": "true", "url": url}]) - await context.set_extra_http_headers(self.headers) - page = await context.new_page() - self.sessions[session_id] = (context, page, time.time()) - else: - context = await self.browser.new_context( - user_agent=self.user_agent, - viewport={"width": 1920, "height": 1080}, - proxy={"server": 
self.proxy} if self.proxy else None - ) - await context.set_extra_http_headers(self.headers) - - if kwargs.get("override_navigator", False) or kwargs.get("simulate_user", False) or kwargs.get("magic", False): - # Inject scripts to override navigator properties - await context.add_init_script(""" - // Pass the Permissions Test. - const originalQuery = window.navigator.permissions.query; - window.navigator.permissions.query = (parameters) => ( - parameters.name === 'notifications' ? - Promise.resolve({ state: Notification.permission }) : - originalQuery(parameters) - ); - Object.defineProperty(navigator, 'webdriver', { - get: () => undefined - }); - window.navigator.chrome = { - runtime: {}, - // Add other properties if necessary - }; - Object.defineProperty(navigator, 'plugins', { - get: () => [1, 2, 3, 4, 5], - }); - Object.defineProperty(navigator, 'languages', { - get: () => ['en-US', 'en'], - }); - Object.defineProperty(document, 'hidden', { - get: () => false - }); - Object.defineProperty(document, 'visibilityState', { - get: () => 'visible' - }); - """) - - page = await context.new_page() - if kwargs.get("magic", False): - await stealth_async(page, stealth_config) - - # Add console message and error logging - if kwargs.get("log_console", False): - page.on("console", lambda msg: print(f"Console: {msg.text}")) - page.on("pageerror", lambda exc: print(f"Page Error: {exc}")) - - try: - if self.verbose: - print(f"[LOG] 🕸️ Crawling {url} using AsyncPlaywrightCrawlerStrategy...") - - if self.use_cached_html: - cache_file_path = os.path.join( - Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest() - ) - if os.path.exists(cache_file_path): - html = "" - with open(cache_file_path, "r") as f: - html = f.read() - # retrieve response headers and status code from cache - with open(cache_file_path + ".meta", "r") as f: - meta = json.load(f) - response_headers = meta.get("response_headers", {}) - status_code = meta.get("status_code") - response = 
AsyncCrawlResponse( - html=html, response_headers=response_headers, status_code=status_code - ) - return response - - if not kwargs.get("js_only", False): - await self.execute_hook('before_goto', page) - - # response = await page.goto( - # url, wait_until="domcontentloaded", timeout=kwargs.get("page_timeout", 60000) - # ) - - # Add retry logic for HTTP2 errors - max_retries = kwargs.get("max_retries", 3) - current_try = 0 - - while current_try < max_retries: - try: - response = await page.goto( - url, - # wait_until=kwargs.get("wait_until", ["domcontentloaded", "networkidle"]), - wait_until=kwargs.get("wait_until", "networkidle"), - timeout=kwargs.get("page_timeout", 60000) - ) - break - except Exception as e: - current_try += 1 - if "ERR_HTTP2_PROTOCOL_ERROR" in str(e): - if current_try < max_retries: - # Add exponential backoff - await asyncio.sleep(2 ** current_try) - # Try with different protocol - if 'args' not in kwargs: - kwargs['args'] = [] - kwargs['args'].extend(['--disable-http2']) - continue - if current_try == max_retries: - raise - - # response = await page.goto("about:blank") - # await page.evaluate(f"window.location.href = '{url}'") - - await self.execute_hook('after_goto', page) - - # Get status code and headers - status_code = response.status - response_headers = response.headers - else: - status_code = 200 - response_headers = {} - - # Replace the current wait_for_selector line with this more robust check: - try: - # First wait for body to exist, regardless of visibility - await page.wait_for_selector('body', state='attached', timeout=30000) - - # Then wait for it to become visible by checking CSS - await page.wait_for_function(""" - () => { - const body = document.body; - const style = window.getComputedStyle(body); - return style.display !== 'none' && - style.visibility !== 'hidden' && - style.opacity !== '0'; - } - """, timeout=30000) - - except Error as e: - # If waiting fails, let's try to diagnose the issue - visibility_info = await 
page.evaluate(""" - () => { - const body = document.body; - const style = window.getComputedStyle(body); - return { - display: style.display, - visibility: style.visibility, - opacity: style.opacity, - hasContent: body.innerHTML.length, - classList: Array.from(body.classList) - } - } - """) - - if self.verbose: - print(f"Body visibility debug info: {visibility_info}") - - # Even if body is hidden, we might still want to proceed - if kwargs.get('ignore_body_visibility', True): - if self.verbose: - print("Proceeding despite hidden body...") - pass - else: - raise Error(f"Body element is hidden: {visibility_info}") - - await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") - - js_code = kwargs.get("js_code", kwargs.get("js", self.js_code)) - if js_code: - if isinstance(js_code, str): - await page.evaluate(js_code) - elif isinstance(js_code, list): - for js in js_code: - await page.evaluate(js) - - await page.wait_for_load_state('networkidle') - # Check for on execution event - await self.execute_hook('on_execution_started', page) - - if kwargs.get("simulate_user", False) or kwargs.get("magic", False): - # Simulate user interactions - await page.mouse.move(100, 100) - await page.mouse.down() - await page.mouse.up() - await page.keyboard.press('ArrowDown') - - # Handle the wait_for parameter - wait_for = kwargs.get("wait_for") - if wait_for: - try: - await self.smart_wait(page, wait_for, timeout=kwargs.get("page_timeout", 60000)) - except Exception as e: - raise RuntimeError(f"Wait condition failed: {str(e)}") - - # Update image dimensions - update_image_dimensions_js = """ - () => { - return new Promise((resolve) => { - const filterImage = (img) => { - // Filter out images that are too small - if (img.width < 100 && img.height < 100) return false; - - // Filter out images that are not visible - const rect = img.getBoundingClientRect(); - if (rect.width === 0 || rect.height === 0) return false; - - // Filter out images with certain class names (e.g., 
icons, thumbnails) - if (img.classList.contains('icon') || img.classList.contains('thumbnail')) return false; - - // Filter out images with certain patterns in their src (e.g., placeholder images) - if (img.src.includes('placeholder') || img.src.includes('icon')) return false; - - return true; - }; - - const images = Array.from(document.querySelectorAll('img')).filter(filterImage); - let imagesLeft = images.length; - - if (imagesLeft === 0) { - resolve(); - return; - } - - const checkImage = (img) => { - if (img.complete && img.naturalWidth !== 0) { - img.setAttribute('width', img.naturalWidth); - img.setAttribute('height', img.naturalHeight); - imagesLeft--; - if (imagesLeft === 0) resolve(); - } - }; - - images.forEach(img => { - checkImage(img); - if (!img.complete) { - img.onload = () => { - checkImage(img); - }; - img.onerror = () => { - imagesLeft--; - if (imagesLeft === 0) resolve(); - }; - } - }); - - // Fallback timeout of 5 seconds - // setTimeout(() => resolve(), 5000); - resolve(); - }); - } - """ - await page.evaluate(update_image_dimensions_js) - - # Wait a bit for any onload events to complete - await page.wait_for_timeout(100) - - # Process iframes - if kwargs.get("process_iframes", False): - page = await self.process_iframes(page) - - await self.execute_hook('before_retrieve_html', page) - # Check if delay_before_return_html is set then wait for that time - delay_before_return_html = kwargs.get("delay_before_return_html") - if delay_before_return_html: - await asyncio.sleep(delay_before_return_html) - - # Check for remove_overlay_elements parameter - if kwargs.get("remove_overlay_elements", False): - await self.remove_overlay_elements(page) - - html = await page.content() - await self.execute_hook('before_return_html', page, html) - - # Check if kwargs has screenshot=True then take screenshot - screenshot_data = None - if kwargs.get("screenshot"): - # Check we have screenshot_wait_for parameter, if we have simply wait for that time - 
screenshot_wait_for = kwargs.get("screenshot_wait_for") - if screenshot_wait_for: - await asyncio.sleep(screenshot_wait_for) - screenshot_data = await self.take_screenshot(page) - - if self.verbose: - print(f"[LOG] ✅ Crawled {url} successfully!") - - if self.use_cached_html: - cache_file_path = os.path.join( - Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest() - ) - with open(cache_file_path, "w", encoding="utf-8") as f: - f.write(html) - # store response headers and status code in cache - with open(cache_file_path + ".meta", "w", encoding="utf-8") as f: - json.dump({ - "response_headers": response_headers, - "status_code": status_code - }, f) - - async def get_delayed_content(delay: float = 5.0) -> str: - if self.verbose: - print(f"[LOG] Waiting for {delay} seconds before retrieving content for {url}") - await asyncio.sleep(delay) - return await page.content() - - response = AsyncCrawlResponse( - html=html, - response_headers=response_headers, - status_code=status_code, - screenshot=screenshot_data, - get_delayed_content=get_delayed_content - ) - return response - except Error as e: - raise Error(f"[ERROR] 🚫 crawl(): Failed to crawl {url}: {str(e)}") - # finally: - # if not session_id: - # await page.close() - # await context.close() - - async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]: - semaphore_count = kwargs.get('semaphore_count', 5) # Adjust as needed - semaphore = asyncio.Semaphore(semaphore_count) - - async def crawl_with_semaphore(url): - async with semaphore: - return await self.crawl(url, **kwargs) - - tasks = [crawl_with_semaphore(url) for url in urls] - results = await asyncio.gather(*tasks, return_exceptions=True) - return [result if not isinstance(result, Exception) else str(result) for result in results] - - async def remove_overlay_elements(self, page: Page) -> None: - """ - Removes popup overlays, modals, cookie notices, and other intrusive elements from the page. 
- - Args: - page (Page): The Playwright page instance - """ - remove_overlays_js = """ - async () => { - // Function to check if element is visible - const isVisible = (elem) => { - const style = window.getComputedStyle(elem); - return style.display !== 'none' && - style.visibility !== 'hidden' && - style.opacity !== '0'; - }; - - // Common selectors for popups and overlays - const commonSelectors = [ - // Close buttons first - 'button[class*="close" i]', 'button[class*="dismiss" i]', - 'button[aria-label*="close" i]', 'button[title*="close" i]', - 'a[class*="close" i]', 'span[class*="close" i]', - - // Cookie notices - '[class*="cookie-banner" i]', '[id*="cookie-banner" i]', - '[class*="cookie-consent" i]', '[id*="cookie-consent" i]', - - // Newsletter/subscription dialogs - '[class*="newsletter" i]', '[class*="subscribe" i]', - - // Generic popups/modals - '[class*="popup" i]', '[class*="modal" i]', - '[class*="overlay" i]', '[class*="dialog" i]', - '[role="dialog"]', '[role="alertdialog"]' - ]; - - // Try to click close buttons first - for (const selector of commonSelectors.slice(0, 6)) { - const closeButtons = document.querySelectorAll(selector); - for (const button of closeButtons) { - if (isVisible(button)) { - try { - button.click(); - await new Promise(resolve => setTimeout(resolve, 100)); - } catch (e) { - console.log('Error clicking button:', e); - } - } - } - } - - // Remove remaining overlay elements - const removeOverlays = () => { - // Find elements with high z-index - const allElements = document.querySelectorAll('*'); - for (const elem of allElements) { - const style = window.getComputedStyle(elem); - const zIndex = parseInt(style.zIndex); - const position = style.position; - - if ( - isVisible(elem) && - (zIndex > 999 || position === 'fixed' || position === 'absolute') && - ( - elem.offsetWidth > window.innerWidth * 0.5 || - elem.offsetHeight > window.innerHeight * 0.5 || - style.backgroundColor.includes('rgba') || - parseFloat(style.opacity) < 1 - 
) - ) { - elem.remove(); - } - } - - // Remove elements matching common selectors - for (const selector of commonSelectors) { - const elements = document.querySelectorAll(selector); - elements.forEach(elem => { - if (isVisible(elem)) { - elem.remove(); - } - }); - } - }; - - // Remove overlay elements - removeOverlays(); - - // Remove any fixed/sticky position elements at the top/bottom - const removeFixedElements = () => { - const elements = document.querySelectorAll('*'); - elements.forEach(elem => { - const style = window.getComputedStyle(elem); - if ( - (style.position === 'fixed' || style.position === 'sticky') && - isVisible(elem) - ) { - elem.remove(); - } - }); - }; - - removeFixedElements(); - - // Remove empty block elements as: div, p, span, etc. - const removeEmptyBlockElements = () => { - const blockElements = document.querySelectorAll('div, p, span, section, article, header, footer, aside, nav, main, ul, ol, li, dl, dt, dd, h1, h2, h3, h4, h5, h6'); - blockElements.forEach(elem => { - if (elem.innerText.trim() === '') { - elem.remove(); - } - }); - }; - - // Remove margin-right and padding-right from body (often added by modal scripts) - document.body.style.marginRight = '0px'; - document.body.style.paddingRight = '0px'; - document.body.style.overflow = 'auto'; - - // Wait a bit for any animations to complete - await new Promise(resolve => setTimeout(resolve, 100)); - } - """ - - try: - await page.evaluate(remove_overlays_js) - await page.wait_for_timeout(500) # Wait for any animations to complete - except Exception as e: - if self.verbose: - print(f"Warning: Failed to remove overlay elements: {str(e)}") - - async def take_screenshot(self, page: Page) -> str: - try: - # The page is already loaded, just take the screenshot - screenshot = await page.screenshot(full_page=True) - return base64.b64encode(screenshot).decode('utf-8') - except Exception as e: - error_message = f"Failed to take screenshot: {str(e)}" - print(error_message) - - # Generate an 
error image - img = Image.new('RGB', (800, 600), color='black') - draw = ImageDraw.Draw(img) - font = ImageFont.load_default() - draw.text((10, 10), error_message, fill=(255, 255, 255), font=font) - - buffered = BytesIO() - img.save(buffered, format="JPEG") - return base64.b64encode(buffered.getvalue()).decode('utf-8') - finally: - await page.close() - diff --git a/crawl4ai/async_database.py b/crawl4ai/async_database.py index 273ca6c9..c52e3db6 100644 --- a/crawl4ai/async_database.py +++ b/crawl4ai/async_database.py @@ -91,7 +91,8 @@ class AsyncDatabaseManager: links TEXT DEFAULT "{}", metadata TEXT DEFAULT "{}", screenshot TEXT DEFAULT "", - response_headers TEXT DEFAULT "{}" -- New column added + response_headers TEXT DEFAULT "{}", + downloaded_files TEXT DEFAULT "{}" -- New column added ) ''') @@ -108,7 +109,7 @@ class AsyncDatabaseManager: column_names = await self.execute_with_retry(_check_columns) # List of new columns to add - new_columns = ['media', 'links', 'metadata', 'screenshot', 'response_headers'] + new_columns = ['media', 'links', 'metadata', 'screenshot', 'response_headers', 'downloaded_files'] for column in new_columns: if column not in column_names: @@ -130,7 +131,7 @@ class AsyncDatabaseManager: async def _get(db): async with db.execute( ''' - SELECT url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot, response_headers + SELECT url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot, response_headers, downloaded_files FROM crawled_data WHERE url = ? 
''', (url,) @@ -149,7 +150,8 @@ class AsyncDatabaseManager: json.loads(row[7] or '{}'), # links json.loads(row[8] or '{}'), # metadata row[9], # screenshot - json.loads(row[10] or '{}') # response_headers + json.loads(row[10] or '{}'), # response_headers + json.loads(row[11] or '[]') # downloaded_files ) return None @@ -171,15 +173,16 @@ class AsyncDatabaseManager: links: str = "{}", metadata: str = "{}", screenshot: str = "", - response_headers: str = "{}" # New parameter added + response_headers: str = "{}", + downloaded_files: str = "[]" ): """Cache URL data with retry logic""" async def _cache(db): await db.execute(''' INSERT INTO crawled_data ( - url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot, response_headers + url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot, response_headers, downloaded_files ) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
ON CONFLICT(url) DO UPDATE SET html = excluded.html, cleaned_html = excluded.cleaned_html, @@ -190,8 +193,9 @@ class AsyncDatabaseManager: links = excluded.links, metadata = excluded.metadata, screenshot = excluded.screenshot, - response_headers = excluded.response_headers -- Update response_headers - ''', (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot, response_headers)) + response_headers = excluded.response_headers, -- Update response_headers + downloaded_files = excluded.downloaded_files + ''', (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot, response_headers, downloaded_files)) try: await self.execute_with_retry(_cache) diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 8415f9b9..cec1ace0 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -160,12 +160,35 @@ class AsyncWebCrawler: if async_response: crawl_result.status_code = async_response.status_code crawl_result.response_headers = async_response.response_headers + crawl_result.downloaded_files = async_response.downloaded_files else: crawl_result.status_code = 200 crawl_result.response_headers = cached[10] + # crawl_result.downloaded_files = cached[11] crawl_result.success = bool(html) crawl_result.session_id = kwargs.get("session_id", None) + + + if not is_raw_html: + if not bool(cached) or kwargs.get("bypass_cache", False) or self.always_by_pass_cache: + await async_db_manager.acache_url( + url = url, + html = html, + cleaned_html = crawl_result.cleaned_html, + markdown = crawl_result.markdown, + extracted_content = extracted_content, + success = True, + media = json.dumps(crawl_result.media), + links = json.dumps(crawl_result.links), + metadata = json.dumps(crawl_result.metadata), + screenshot=screenshot, + response_headers=json.dumps(crawl_result.response_headers), + downloaded_files=json.dumps(crawl_result.downloaded_files), + + ) + + return 
crawl_result except Exception as e: @@ -233,8 +256,6 @@ class AsyncWebCrawler: css_selector: str, screenshot: str, verbose: bool, - is_cached: bool, - async_response: Optional[AsyncCrawlResponse], **kwargs, ) -> CrawlResult: t = time.time() @@ -298,28 +319,6 @@ class AsyncWebCrawler: screenshot = None if not screenshot else screenshot - response_headers = "{}" # Default value - if async_response: - # Serialize response_headers dict to JSON string - response_headers = json.dumps(async_response.response_headers, ensure_ascii=False) - - - if not kwargs.get("is_raw_html", False): - if not is_cached or kwargs.get("bypass_cache", False) or self.always_by_pass_cache: - await async_db_manager.acache_url( - url, - html, - cleaned_html, - markdown, - extracted_content, - True, - json.dumps(media), - json.dumps(links), - json.dumps(metadata), - screenshot=screenshot, - response_headers=response_headers, - ) - return CrawlResult( url=url, html=html, diff --git a/crawl4ai/content_cleaning_strategy.py b/crawl4ai/content_cleaning_strategy.py deleted file mode 100644 index b8a5053d..00000000 --- a/crawl4ai/content_cleaning_strategy.py +++ /dev/null @@ -1,198 +0,0 @@ -from bs4 import BeautifulSoup, Tag -import re -from typing import Optional - -class ContentCleaningStrategy: - def __init__(self): - # Precompile regex patterns for performance - self.negative_patterns = re.compile(r'nav|footer|header|sidebar|ads|comment', re.I) - self.positive_patterns = re.compile(r'content|article|main|post', re.I) - self.priority_tags = {'article', 'main', 'section', 'div'} - self.non_content_tags = {'nav', 'footer', 'header', 'aside'} - # Thresholds - self.text_density_threshold = 9.0 - self.min_word_count = 50 - self.link_density_threshold = 0.2 - self.max_dom_depth = 10 # To prevent excessive DOM traversal - - def clean(self, clean_html: str, soup = None) -> str: - """ - Main function that takes cleaned HTML and returns super cleaned HTML. - - Args: - clean_html (str): The cleaned HTML content. 
- - Returns: - str: The super cleaned HTML containing only the main content. - """ - try: - if not clean_html or not isinstance(clean_html, str): - return '' - if not soup: - # soup = BeautifulSoup(clean_html, 'html.parser') - soup = BeautifulSoup(clean_html, 'lxml') - main_content = self.extract_main_content(soup) - if main_content: - super_clean_element = self.clean_element(main_content) - return super_clean_element.encode_contents().decode('utf-8') - else: - return '' - except Exception: - # Handle exceptions silently or log them as needed - return '' - - def extract_main_content(self, soup) -> Optional[Tag]: - """ - Identifies and extracts the main content element from the HTML. - - Args: - soup (BeautifulSoup): The parsed HTML soup. - - Returns: - Optional[Tag]: The Tag object containing the main content, or None if not found. - """ - candidates = [] - for element in soup.find_all(self.priority_tags): - if self.is_non_content_tag(element): - continue - if self.has_negative_class_id(element): - continue - score = self.calculate_content_score(element) - candidates.append((score, element)) - - if not candidates: - return None - - # Sort candidates by score in descending order - candidates.sort(key=lambda x: x[0], reverse=True) - # Select the element with the highest score - best_element = candidates[0][1] - return best_element - - def calculate_content_score(self, element: Tag) -> float: - """ - Calculates a score for an element based on various heuristics. - - Args: - element (Tag): The HTML element to score. - - Returns: - float: The content score of the element. 
- """ - score = 0.0 - - if self.is_priority_tag(element): - score += 5.0 - if self.has_positive_class_id(element): - score += 3.0 - if self.has_negative_class_id(element): - score -= 3.0 - if self.is_high_text_density(element): - score += 2.0 - if self.is_low_link_density(element): - score += 2.0 - if self.has_sufficient_content(element): - score += 2.0 - if self.has_headings(element): - score += 3.0 - - dom_depth = self.calculate_dom_depth(element) - score += min(dom_depth, self.max_dom_depth) * 0.5 # Adjust weight as needed - - return score - - def is_priority_tag(self, element: Tag) -> bool: - """Checks if the element is a priority tag.""" - return element.name in self.priority_tags - - def is_non_content_tag(self, element: Tag) -> bool: - """Checks if the element is a non-content tag.""" - return element.name in self.non_content_tags - - def has_negative_class_id(self, element: Tag) -> bool: - """Checks if the element has negative indicators in its class or id.""" - class_id = ' '.join(filter(None, [ - self.get_attr_str(element.get('class')), - element.get('id', '') - ])) - return bool(self.negative_patterns.search(class_id)) - - def has_positive_class_id(self, element: Tag) -> bool: - """Checks if the element has positive indicators in its class or id.""" - class_id = ' '.join(filter(None, [ - self.get_attr_str(element.get('class')), - element.get('id', '') - ])) - return bool(self.positive_patterns.search(class_id)) - - @staticmethod - def get_attr_str(attr) -> str: - """Converts an attribute value to a string.""" - if isinstance(attr, list): - return ' '.join(attr) - elif isinstance(attr, str): - return attr - else: - return '' - - def is_high_text_density(self, element: Tag) -> bool: - """Determines if the element has high text density.""" - text_density = self.calculate_text_density(element) - return text_density > self.text_density_threshold - - def calculate_text_density(self, element: Tag) -> float: - """Calculates the text density of an element.""" - 
text_length = len(element.get_text(strip=True)) - tag_count = len(element.find_all()) - tag_count = tag_count or 1 # Prevent division by zero - return text_length / tag_count - - def is_low_link_density(self, element: Tag) -> bool: - """Determines if the element has low link density.""" - link_density = self.calculate_link_density(element) - return link_density < self.link_density_threshold - - def calculate_link_density(self, element: Tag) -> float: - """Calculates the link density of an element.""" - text = element.get_text(strip=True) - if not text: - return 0.0 - link_text = ' '.join(a.get_text(strip=True) for a in element.find_all('a')) - return len(link_text) / len(text) if text else 0.0 - - def has_sufficient_content(self, element: Tag) -> bool: - """Checks if the element has sufficient word count.""" - word_count = len(element.get_text(strip=True).split()) - return word_count >= self.min_word_count - - def calculate_dom_depth(self, element: Tag) -> int: - """Calculates the depth of an element in the DOM tree.""" - depth = 0 - current_element = element - while current_element.parent and depth < self.max_dom_depth: - depth += 1 - current_element = current_element.parent - return depth - - def has_headings(self, element: Tag) -> bool: - """Checks if the element contains heading tags.""" - return bool(element.find(['h1', 'h2', 'h3'])) - - def clean_element(self, element: Tag) -> Tag: - """ - Cleans the selected element by removing unnecessary attributes and nested non-content elements. - - Args: - element (Tag): The HTML element to clean. - - Returns: - Tag: The cleaned HTML element. 
- """ - for tag in element.find_all(['script', 'style', 'aside']): - tag.decompose() - for tag in element.find_all(): - attrs = dict(tag.attrs) - for attr in attrs: - if attr in ['style', 'onclick', 'onmouseover', 'align', 'bgcolor']: - del tag.attrs[attr] - return element diff --git a/crawl4ai/content_filter_strategy.py b/crawl4ai/content_filter_strategy.py new file mode 100644 index 00000000..850ebf11 --- /dev/null +++ b/crawl4ai/content_filter_strategy.py @@ -0,0 +1,344 @@ +import os +import re +import time +from bs4 import BeautifulSoup, Tag +from typing import List, Tuple, Dict +from rank_bm25 import BM25Okapi +import nltk +from time import perf_counter +from html5lib import parse, treebuilders +from time import perf_counter +from collections import deque +from bs4 import BeautifulSoup, NavigableString, Tag +from .utils import clean_tokens +from abc import ABC, abstractmethod + +class RelevantContentFilter(ABC): + def __init__(self, user_query: str = None): + self.user_query = user_query + self.included_tags = { + # Primary structure + 'article', 'main', 'section', 'div', + # List structures + 'ul', 'ol', 'li', 'dl', 'dt', 'dd', + # Text content + 'p', 'span', 'blockquote', 'pre', 'code', + # Headers + 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', + # Tables + 'table', 'thead', 'tbody', 'tr', 'td', 'th', + # Other semantic elements + 'figure', 'figcaption', 'details', 'summary', + # Text formatting + 'em', 'strong', 'b', 'i', 'mark', 'small', + # Rich content + 'time', 'address', 'cite', 'q' + } + self.excluded_tags = { + 'nav', 'footer', 'header', 'aside', 'script', + 'style', 'form', 'iframe', 'noscript' + } + self.header_tags = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'} + self.negative_patterns = re.compile( + r'nav|footer|header|sidebar|ads|comment|promo|advert|social|share', + re.I + ) + self.min_word_count = 2 + + @abstractmethod + def filter_content(self, html: str) -> List[str]: + """Abstract method to be implemented by specific filtering strategies""" + pass + + def 
extract_page_query(self, soup: BeautifulSoup, body: Tag) -> str: + """Common method to extract page metadata with fallbacks""" + if self.user_query: + return self.user_query + + query_parts = [] + + # Title + if soup.title: + query_parts.append(soup.title.string) + elif soup.find('h1'): + query_parts.append(soup.find('h1').get_text()) + + # Meta tags + temp = "" + for meta_name in ['keywords', 'description']: + meta = soup.find('meta', attrs={'name': meta_name}) + if meta and meta.get('content'): + query_parts.append(meta['content']) + temp += meta['content'] + + # If still empty, grab first significant paragraph + if not temp: + # Find the first tag P thatits text contains more than 50 characters + for p in body.find_all('p'): + if len(p.get_text()) > 150: + query_parts.append(p.get_text()[:150]) + break + + return ' '.join(filter(None, query_parts)) + + + def extract_text_chunks(self, body: Tag) -> List[Tuple[str, str]]: + """ + Extracts text chunks from a BeautifulSoup body element while preserving order. + Returns list of tuples (text, tag_name) for classification. 
+ + Args: + body: BeautifulSoup Tag object representing the body element + + Returns: + List of (text, tag_name) tuples + """ + # Tags to ignore - inline elements that shouldn't break text flow + INLINE_TAGS = { + 'a', 'abbr', 'acronym', 'b', 'bdo', 'big', 'br', 'button', 'cite', 'code', + 'dfn', 'em', 'i', 'img', 'input', 'kbd', 'label', 'map', 'object', 'q', + 'samp', 'script', 'select', 'small', 'span', 'strong', 'sub', 'sup', + 'textarea', 'time', 'tt', 'var' + } + + # Tags that typically contain meaningful headers + HEADER_TAGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header'} + + chunks = [] + current_text = [] + chunk_index = 0 + + def should_break_chunk(tag: Tag) -> bool: + """Determine if a tag should cause a break in the current text chunk""" + return ( + tag.name not in INLINE_TAGS + and not (tag.name == 'p' and len(current_text) == 0) + ) + + # Use deque for efficient push/pop operations + stack = deque([(body, False)]) + + while stack: + element, visited = stack.pop() + + if visited: + # End of block element - flush accumulated text + if current_text and should_break_chunk(element): + text = ' '.join(''.join(current_text).split()) + if text: + tag_type = 'header' if element.name in HEADER_TAGS else 'content' + chunks.append((chunk_index, text, tag_type, element)) + chunk_index += 1 + current_text = [] + continue + + if isinstance(element, NavigableString): + if str(element).strip(): + current_text.append(str(element).strip()) + continue + + # Pre-allocate children to avoid multiple list operations + children = list(element.children) + if not children: + continue + + # Mark block for revisit after processing children + stack.append((element, True)) + + # Add children in reverse order for correct processing + for child in reversed(children): + if isinstance(child, (Tag, NavigableString)): + stack.append((child, False)) + + # Handle any remaining text + if current_text: + text = ' '.join(''.join(current_text).split()) + if text: + 
chunks.append((chunk_index, text, 'content', body)) + + return chunks + + + def extract_text_chunks1(self, soup: BeautifulSoup) -> List[Tuple[int, str, Tag]]: + """Common method for extracting text chunks""" + _text_cache = {} + def fast_text(element: Tag) -> str: + elem_id = id(element) + if elem_id in _text_cache: + return _text_cache[elem_id] + texts = [] + for content in element.contents: + if isinstance(content, str): + text = content.strip() + if text: + texts.append(text) + result = ' '.join(texts) + _text_cache[elem_id] = result + return result + + candidates = [] + index = 0 + + def dfs(element): + nonlocal index + if isinstance(element, Tag): + if element.name in self.included_tags: + if not self.is_excluded(element): + text = fast_text(element) + word_count = len(text.split()) + + # Headers pass through with adjusted minimum + if element.name in self.header_tags: + if word_count >= 3: # Minimal sanity check for headers + candidates.append((index, text, element)) + index += 1 + # Regular content uses standard minimum + elif word_count >= self.min_word_count: + candidates.append((index, text, element)) + index += 1 + + for child in element.children: + dfs(child) + + dfs(soup.body if soup.body else soup) + return candidates + + def is_excluded(self, tag: Tag) -> bool: + """Common method for exclusion logic""" + if tag.name in self.excluded_tags: + return True + class_id = ' '.join(filter(None, [ + ' '.join(tag.get('class', [])), + tag.get('id', '') + ])) + return bool(self.negative_patterns.search(class_id)) + + def clean_element(self, tag: Tag) -> str: + """Common method for cleaning HTML elements with minimal overhead""" + if not tag or not isinstance(tag, Tag): + return "" + + unwanted_tags = {'script', 'style', 'aside', 'form', 'iframe', 'noscript'} + unwanted_attrs = {'style', 'onclick', 'onmouseover', 'align', 'bgcolor', 'class', 'id'} + + # Use string builder pattern for better performance + builder = [] + + def render_tag(elem): + if not 
isinstance(elem, Tag): + if isinstance(elem, str): + builder.append(elem.strip()) + return + + if elem.name in unwanted_tags: + return + + # Start tag + builder.append(f'<{elem.name}') + + # Add cleaned attributes + attrs = {k: v for k, v in elem.attrs.items() if k not in unwanted_attrs} + for key, value in attrs.items(): + builder.append(f' {key}="{value}"') + + builder.append('>') + + # Process children + for child in elem.children: + render_tag(child) + + # Close tag + builder.append(f'') + + try: + render_tag(tag) + return ''.join(builder) + except Exception: + return str(tag) # Fallback to original if anything fails + +class BM25ContentFilter(RelevantContentFilter): + def __init__(self, user_query: str = None, bm25_threshold: float = 1.0): + super().__init__(user_query=user_query) + self.bm25_threshold = bm25_threshold + self.priority_tags = { + 'h1': 5.0, + 'h2': 4.0, + 'h3': 3.0, + 'title': 4.0, + 'strong': 2.0, + 'b': 1.5, + 'em': 1.5, + 'blockquote': 2.0, + 'code': 2.0, + 'pre': 1.5, + 'th': 1.5, # Table headers + } + + def filter_content(self, html: str) -> List[str]: + """Implements content filtering using BM25 algorithm with priority tag handling""" + if not html or not isinstance(html, str): + return [] + + soup = BeautifulSoup(html, 'lxml') + body = soup.find('body') + query = self.extract_page_query(soup.find('head'), body) + candidates = self.extract_text_chunks(body) + + if not candidates: + return [] + + # Split into priority and regular candidates + priority_candidates = [] + regular_candidates = [] + + for index, chunk, tag_type, tag in candidates: + if tag.name in self.priority_tags: + priority_candidates.append((index, chunk, tag_type, tag)) + else: + regular_candidates.append((index, chunk, tag_type, tag)) + + # Process regular content with BM25 + tokenized_corpus = [chunk.lower().split() for _, chunk, _, _ in regular_candidates] + tokenized_query = query.lower().split() + + # Clean from stop words and noise + tokenized_corpus = 
[clean_tokens(tokens) for tokens in tokenized_corpus] + tokenized_query = clean_tokens(tokenized_query) + + bm25 = BM25Okapi(tokenized_corpus) + scores = bm25.get_scores(tokenized_query) + + # Score and boost regular candidates + scored_candidates = [ + (score * self.priority_tags.get(tag.name, 1.0), index, chunk, tag_type, tag) + for score, (index, chunk, tag_type, tag) in zip(scores, regular_candidates) + ] + scored_candidates.sort(key=lambda x: x[0], reverse=True) + + # Process scored candidates + selected_tags = set() + selected_candidates = [] + + # First add all priority candidates + for index, chunk, tag_type, tag in priority_candidates: + tag_id = id(tag) + if tag_id not in selected_tags: + selected_candidates.append((index, chunk, tag)) + selected_tags.add(tag_id) + + # Then add scored regular candidates that meet threshold + for score, index, chunk, tag_type, tag in scored_candidates: + if score < self.bm25_threshold: + continue + tag_id = id(tag) + if tag_id not in selected_tags: + selected_candidates.append((index, chunk, tag)) + selected_tags.add(tag_id) + + if not selected_candidates: + return [] + + # Sort by original document order + selected_candidates.sort(key=lambda x: x[0]) + return [self.clean_element(tag) for _, _, tag in selected_candidates] + diff --git a/crawl4ai/content_scrapping_strategy.py b/crawl4ai/content_scrapping_strategy.py index a2dbbd96..9c81638c 100644 --- a/crawl4ai/content_scrapping_strategy.py +++ b/crawl4ai/content_scrapping_strategy.py @@ -8,7 +8,8 @@ from .config import * from bs4 import element, NavigableString, Comment from urllib.parse import urljoin from requests.exceptions import InvalidSchema -from .content_cleaning_strategy import ContentCleaningStrategy +# from .content_cleaning_strategy import ContentCleaningStrategy +from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter from .utils import ( sanitize_input_encode, @@ -532,8 +533,15 @@ class WebScrapingStrategy(ContentScrapingStrategy): 
fit_markdown = "Set flag 'fit_markdown' to True to get cleaned HTML content." fit_html = "Set flag 'fit_markdown' to True to get cleaned HTML content." if kwargs.get('fit_markdown', False): - cleaner = ContentCleaningStrategy() - fit_html = cleaner.clean(cleaned_html) + # cleaner = ContentCleaningStrategy() + # fit_html = cleaner.clean(cleaned_html) + # fit_markdown = h.handle(fit_html) + content_filter = BM25ContentFilter( + user_query= kwargs.get('fit_markdown_user_query', None), + bm25_threshold= kwargs.get('fit_markdown_bm25_threshold', 1.0) + ) + fit_html = content_filter.filter_content(html) + fit_html = '\n'.join('
    {}
    '.format(s) for s in fit_html) fit_markdown = h.handle(fit_html) cleaned_html = sanitize_html(cleaned_html) diff --git a/crawl4ai/models.py b/crawl4ai/models.py index 4ac06797..cab4c45b 100644 --- a/crawl4ai/models.py +++ b/crawl4ai/models.py @@ -1,5 +1,7 @@ from pydantic import BaseModel, HttpUrl -from typing import List, Dict, Optional +from typing import List, Dict, Optional, Callable, Awaitable + + class UrlModel(BaseModel): url: HttpUrl @@ -12,6 +14,7 @@ class CrawlResult(BaseModel): cleaned_html: Optional[str] = None media: Dict[str, List[Dict]] = {} links: Dict[str, List[Dict]] = {} + downloaded_files: Optional[List[str]] = None screenshot: Optional[str] = None markdown: Optional[str] = None fit_markdown: Optional[str] = None @@ -21,4 +24,15 @@ class CrawlResult(BaseModel): error_message: Optional[str] = None session_id: Optional[str] = None response_headers: Optional[dict] = None - status_code: Optional[int] = None \ No newline at end of file + status_code: Optional[int] = None + +class AsyncCrawlResponse(BaseModel): + html: str + response_headers: Dict[str, str] + status_code: int + screenshot: Optional[str] = None + get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None + downloaded_files: Optional[List[str]] = None + + class Config: + arbitrary_types_allowed = True diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index d8bd6992..49483f43 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -1054,3 +1054,58 @@ def is_external_url(url, base_domain): return False return False + +def clean_tokens(tokens: list[str]) -> list[str]: + # Set of tokens to remove + noise = {'ccp', 'up', '↑', '▲', '⬆️', 'a', 'an', 'at', 'by', 'in', 'of', 'on', 'to', 'the'} + + STOP_WORDS = { + 'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from', + 'has', 'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the', + 'to', 'was', 'were', 'will', 'with', + + # Pronouns + 'i', 'you', 'he', 'she', 'it', 'we', 'they', + 'me', 'him', 'her', 
'us', 'them', + 'my', 'your', 'his', 'her', 'its', 'our', 'their', + 'mine', 'yours', 'hers', 'ours', 'theirs', + 'myself', 'yourself', 'himself', 'herself', 'itself', 'ourselves', 'themselves', + + # Common verbs + 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', + 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', + + # Prepositions + 'about', 'above', 'across', 'after', 'against', 'along', 'among', 'around', + 'at', 'before', 'behind', 'below', 'beneath', 'beside', 'between', 'beyond', + 'by', 'down', 'during', 'except', 'for', 'from', 'in', 'inside', 'into', + 'near', 'of', 'off', 'on', 'out', 'outside', 'over', 'past', 'through', + 'to', 'toward', 'under', 'underneath', 'until', 'up', 'upon', 'with', 'within', + + # Conjunctions + 'and', 'but', 'or', 'nor', 'for', 'yet', 'so', + 'although', 'because', 'since', 'unless', + + # Articles + 'a', 'an', 'the', + + # Other common words + 'this', 'that', 'these', 'those', + 'what', 'which', 'who', 'whom', 'whose', + 'when', 'where', 'why', 'how', + 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', + 'can', 'cannot', "can't", 'could', "couldn't", + 'may', 'might', 'must', "mustn't", + 'shall', 'should', "shouldn't", + 'will', "won't", 'would', "wouldn't", + 'not', "n't", 'no', 'nor', 'none' + } + + # Single comprehension, more efficient than multiple passes + return [token for token in tokens + if len(token) > 2 + and token not in noise + and token not in STOP_WORDS + and not token.startswith('↑') + and not token.startswith('▲') + and not token.startswith('⬆')] diff --git a/tests/async/test_async_doanloader.py b/tests/async/test_async_doanloader.py new file mode 100644 index 00000000..4798b4ca --- /dev/null +++ b/tests/async/test_async_doanloader.py @@ -0,0 +1,229 @@ +import os +import sys +import asyncio +import shutil +from typing import List +import tempfile +import time + +# Add the parent directory to the Python path +parent_dir = 
os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +sys.path.append(parent_dir) + +from crawl4ai.async_webcrawler import AsyncWebCrawler + +class TestDownloads: + def __init__(self): + self.temp_dir = tempfile.mkdtemp(prefix="crawl4ai_test_") + self.download_dir = os.path.join(self.temp_dir, "downloads") + os.makedirs(self.download_dir, exist_ok=True) + self.results: List[str] = [] + + def cleanup(self): + shutil.rmtree(self.temp_dir) + + def log_result(self, test_name: str, success: bool, message: str = ""): + result = f"{'✅' if success else '❌'} {test_name}: {message}" + self.results.append(result) + print(result) + + async def test_basic_download(self): + """Test basic file download functionality""" + try: + async with AsyncWebCrawler( + accept_downloads=True, + downloads_path=self.download_dir, + verbose=True + ) as crawler: + # Python.org downloads page typically has stable download links + result = await crawler.arun( + url="https://www.python.org/downloads/", + js_code=""" + // Click first download link + const downloadLink = document.querySelector('a[href$=".exe"]'); + if (downloadLink) downloadLink.click(); + """ + ) + + success = result.downloaded_files is not None and len(result.downloaded_files) > 0 + self.log_result( + "Basic Download", + success, + f"Downloaded {len(result.downloaded_files or [])} files" if success else "No files downloaded" + ) + except Exception as e: + self.log_result("Basic Download", False, str(e)) + + async def test_persistent_context_download(self): + """Test downloads with persistent context""" + try: + user_data_dir = os.path.join(self.temp_dir, "user_data") + os.makedirs(user_data_dir, exist_ok=True) + + async with AsyncWebCrawler( + accept_downloads=True, + downloads_path=self.download_dir, + use_persistent_context=True, + user_data_dir=user_data_dir, + verbose=True + ) as crawler: + result = await crawler.arun( + url="https://www.python.org/downloads/", + js_code=""" + const downloadLink = 
document.querySelector('a[href$=".exe"]'); + if (downloadLink) downloadLink.click(); + """ + ) + + success = result.downloaded_files is not None and len(result.downloaded_files) > 0 + self.log_result( + "Persistent Context Download", + success, + f"Downloaded {len(result.downloaded_files or [])} files" if success else "No files downloaded" + ) + except Exception as e: + self.log_result("Persistent Context Download", False, str(e)) + + async def test_multiple_downloads(self): + """Test multiple simultaneous downloads""" + try: + async with AsyncWebCrawler( + accept_downloads=True, + downloads_path=self.download_dir, + verbose=True + ) as crawler: + result = await crawler.arun( + url="https://www.python.org/downloads/", + js_code=""" + // Click multiple download links + const downloadLinks = document.querySelectorAll('a[href$=".exe"]'); + downloadLinks.forEach(link => link.click()); + """ + ) + + success = result.downloaded_files is not None and len(result.downloaded_files) > 1 + self.log_result( + "Multiple Downloads", + success, + f"Downloaded {len(result.downloaded_files or [])} files" if success else "Not enough files downloaded" + ) + except Exception as e: + self.log_result("Multiple Downloads", False, str(e)) + + async def test_different_browsers(self): + """Test downloads across different browser types""" + browsers = ["chromium", "firefox", "webkit"] + + for browser_type in browsers: + try: + async with AsyncWebCrawler( + accept_downloads=True, + downloads_path=self.download_dir, + browser_type=browser_type, + verbose=True + ) as crawler: + result = await crawler.arun( + url="https://www.python.org/downloads/", + js_code=""" + const downloadLink = document.querySelector('a[href$=".exe"]'); + if (downloadLink) downloadLink.click(); + """ + ) + + success = result.downloaded_files is not None and len(result.downloaded_files) > 0 + self.log_result( + f"{browser_type.title()} Download", + success, + f"Downloaded {len(result.downloaded_files or [])} files" if 
success else "No files downloaded" + ) + except Exception as e: + self.log_result(f"{browser_type.title()} Download", False, str(e)) + + async def test_edge_cases(self): + """Test various edge cases""" + + # Test 1: Downloads without specifying download path + try: + async with AsyncWebCrawler( + accept_downloads=True, + verbose=True + ) as crawler: + result = await crawler.arun( + url="https://www.python.org/downloads/", + js_code="document.querySelector('a[href$=\".exe\"]').click()" + ) + self.log_result( + "Default Download Path", + True, + f"Downloaded to default path: {result.downloaded_files[0] if result.downloaded_files else 'None'}" + ) + except Exception as e: + self.log_result("Default Download Path", False, str(e)) + + # Test 2: Downloads with invalid path + try: + async with AsyncWebCrawler( + accept_downloads=True, + downloads_path="/invalid/path/that/doesnt/exist", + verbose=True + ) as crawler: + result = await crawler.arun( + url="https://www.python.org/downloads/", + js_code="document.querySelector('a[href$=\".exe\"]').click()" + ) + self.log_result("Invalid Download Path", False, "Should have raised an error") + except Exception as e: + self.log_result("Invalid Download Path", True, "Correctly handled invalid path") + + # Test 3: Download with accept_downloads=False + try: + async with AsyncWebCrawler( + accept_downloads=False, + verbose=True + ) as crawler: + result = await crawler.arun( + url="https://www.python.org/downloads/", + js_code="document.querySelector('a[href$=\".exe\"]').click()" + ) + success = result.downloaded_files is None + self.log_result( + "Disabled Downloads", + success, + "Correctly ignored downloads" if success else "Unexpectedly downloaded files" + ) + except Exception as e: + self.log_result("Disabled Downloads", False, str(e)) + + async def run_all_tests(self): + """Run all test cases""" + print("\n🧪 Running Download Tests...\n") + + test_methods = [ + self.test_basic_download, + self.test_persistent_context_download, + 
self.test_multiple_downloads, + self.test_different_browsers, + self.test_edge_cases + ] + + for test in test_methods: + print(f"\n📝 Running {test.__doc__}...") + await test() + await asyncio.sleep(2) # Brief pause between tests + + print("\n📊 Test Results Summary:") + for result in self.results: + print(result) + + successes = len([r for r in self.results if '✅' in r]) + total = len(self.results) + print(f"\nTotal: {successes}/{total} tests passed") + + self.cleanup() + +async def main(): + tester = TestDownloads() + await tester.run_all_tests() + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file From 7f1ae5adcf8552f9520d93eeec446c6ea7cd57e6 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 14 Nov 2024 22:51:51 +0800 Subject: [PATCH 020/115] Update changelog --- CHANGELOG.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 33d09184..7a00aa2e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,12 @@ # Changelog +# [0.3.74] November 14, 2024 + +- In this commit, the library is updated to process file downloads. Users can now specify a download folder and trigger the download process via JavaScript or other means, with all files being saved. The list of downloaded files will also be added to the crawl result object. +- Another thing this commit introduces is the concept of the Relevance Content Filter. This is an improvement over Fit Markdown. This class of strategies aims to extract the main content from a given page - the part that really matters and is useful to be processed. One strategy has been created using the BM25 algorithm, which finds chunks of text from the web page relevant to its title, descriptions, and keywords, or supports a given user query and matches them. The result is then returned to the main engine to be converted to Markdown. Plans include adding approaches using language models as well. 
+- The cache database was updated to hold information about response headers and downloaded files. + + # Changelog - November 13, 2024 ### Added From 1f269f98344f08bc3390a4f9ec689787cdf5b59b Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 15 Nov 2024 18:11:11 +0800 Subject: [PATCH 021/115] test(content_filter): add comprehensive tests for BM25ContentFilter functionality --- tests/async/test_content_filter.py | 175 +++++++++++++++++++++++++++++ 1 file changed, 175 insertions(+) create mode 100644 tests/async/test_content_filter.py diff --git a/tests/async/test_content_filter.py b/tests/async/test_content_filter.py new file mode 100644 index 00000000..a873c414 --- /dev/null +++ b/tests/async/test_content_filter.py @@ -0,0 +1,175 @@ +import os, sys +import pytest +from bs4 import BeautifulSoup +from typing import List + +# Add the parent directory to the Python path +parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +sys.path.append(parent_dir) + +from crawl4ai.content_filter_strategy import BM25ContentFilter + +@pytest.fixture +def basic_html(): + return """ + + + Test Article + + + + +

    Main Heading

    +
    +

    This is a long paragraph with more than fifty words. It continues with more text to ensure we meet the minimum word count threshold. We need to make sure this paragraph is substantial enough to be considered for extraction according to our filtering rules. This should be enough words now.

    + +
    + + + """ + +@pytest.fixture +def wiki_html(): + return """ + + + Wikipedia Article + + +

    Article Title

    +

    Section 1

    +

    Short but important section header description.

    +
    +

    Long paragraph with sufficient words to meet the minimum threshold. This paragraph continues with more text to ensure we have enough content for proper testing. We need to make sure this has enough words to pass our filters and be considered valid content for extraction purposes.

    +
    + + + """ + +@pytest.fixture +def no_meta_html(): + return """ + + +

    Simple Page

    +

    First paragraph that should be used as fallback for query when no meta tags exist. This text needs to be long enough to serve as a meaningful fallback for our content extraction process.

    + + + """ + +class TestBM25ContentFilter: + def test_basic_extraction(self, basic_html): + """Test basic content extraction functionality""" + filter = BM25ContentFilter() + contents = filter.filter_content(basic_html) + + assert contents, "Should extract content" + assert len(contents) >= 1, "Should extract at least one content block" + assert "long paragraph" in ' '.join(contents).lower() + assert "navigation" not in ' '.join(contents).lower() + + def test_user_query_override(self, basic_html): + """Test that user query overrides metadata extraction""" + user_query = "specific test query" + filter = BM25ContentFilter(user_query=user_query) + + # Access internal state to verify query usage + soup = BeautifulSoup(basic_html, 'lxml') + extracted_query = filter.extract_page_query(soup.find('head')) + + assert extracted_query == user_query + assert "Test description" not in extracted_query + + def test_header_extraction(self, wiki_html): + """Test that headers are properly extracted despite length""" + filter = BM25ContentFilter() + contents = filter.filter_content(wiki_html) + + combined_content = ' '.join(contents).lower() + assert "section 1" in combined_content, "Should include section header" + assert "article title" in combined_content, "Should include main title" + + def test_no_metadata_fallback(self, no_meta_html): + """Test fallback behavior when no metadata is present""" + filter = BM25ContentFilter() + contents = filter.filter_content(no_meta_html) + + assert contents, "Should extract content even without metadata" + assert "First paragraph" in ' '.join(contents), "Should use first paragraph content" + + def test_empty_input(self): + """Test handling of empty input""" + filter = BM25ContentFilter() + assert filter.filter_content("") == [] + assert filter.filter_content(None) == [] + + def test_malformed_html(self): + """Test handling of malformed HTML""" + malformed_html = "

    Unclosed paragraph

    Nested content

    " + filter = BM25ContentFilter() + contents = filter.filter_content(malformed_html) + + assert isinstance(contents, list), "Should return list even with malformed HTML" + + def test_threshold_behavior(self, basic_html): + """Test different BM25 threshold values""" + strict_filter = BM25ContentFilter(bm25_threshold=2.0) + lenient_filter = BM25ContentFilter(bm25_threshold=0.5) + + strict_contents = strict_filter.filter_content(basic_html) + lenient_contents = lenient_filter.filter_content(basic_html) + + assert len(strict_contents) <= len(lenient_contents), \ + "Strict threshold should extract fewer elements" + + def test_html_cleaning(self, basic_html): + """Test HTML cleaning functionality""" + filter = BM25ContentFilter() + contents = filter.filter_content(basic_html) + + cleaned_content = ' '.join(contents) + assert 'class=' not in cleaned_content, "Should remove class attributes" + assert 'style=' not in cleaned_content, "Should remove style attributes" + assert ' +
    {'

    Test content. ' * 1000}

    + + """ + filter = BM25ContentFilter() + contents = filter.filter_content(large_html) + assert contents, "Should handle large content blocks" + + @pytest.mark.parametrize("unwanted_tag", [ + 'script', 'style', 'nav', 'footer', 'header' + ]) + def test_excluded_tags(self, unwanted_tag): + """Test that specific tags are properly excluded""" + html = f""" + + <{unwanted_tag}>Should not appear +

    Should appear

    + + """ + filter = BM25ContentFilter() + contents = filter.filter_content(html) + + combined_content = ' '.join(contents).lower() + assert "should not appear" not in combined_content + + def test_performance(self, basic_html): + """Test performance with timer""" + filter = BM25ContentFilter() + + import time + start = time.perf_counter() + filter.filter_content(basic_html) + duration = time.perf_counter() - start + + assert duration < 1.0, f"Processing took too long: {duration:.2f} seconds" + +if __name__ == "__main__": + pytest.main([__file__]) \ No newline at end of file From ae7ebc0bd82e6d621f0d13a8a22d537f31dff0f6 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 15 Nov 2024 20:16:13 +0800 Subject: [PATCH 022/115] chore: update .gitignore and enhance changelog with major feature additions and examples --- .gitignore | 3 +- CHANGELOG.md | 30 +++++ docs/examples/v0.3.74.overview.py | 195 ++++++++++++++++++++++++++++++ 3 files changed, 227 insertions(+), 1 deletion(-) create mode 100644 docs/examples/v0.3.74.overview.py diff --git a/.gitignore b/.gitignore index aca02959..0acec10f 100644 --- a/.gitignore +++ b/.gitignore @@ -209,4 +209,5 @@ git_issues.md .tests/ .issues/ .docs/ -.issues/ \ No newline at end of file +.issues/ +.gitboss/ \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 7a00aa2e..e82fa6a2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,35 @@ # Changelog + +## Version 0.3.74, Major Changes + +1. **File Download Processing** (Nov 14, 2024) + - Added capability for users to specify download folders + - Implemented file download tracking in crawl result object + - Created new file: `tests/async/test_async_doanloader.py` + +2. 
**Content Filtering Improvements** (Nov 14, 2024) + - Introduced Relevance Content Filter as an improvement over Fit Markdown + - Implemented BM25 algorithm for content relevance matching + - Added new file: `crawl4ai/content_filter_strategy.py` + - Removed deprecated: `crawl4ai/content_cleaning_strategy.py` + +3. **Local File and Raw HTML Support** (Nov 13, 2024) + - Added support for processing local files + - Implemented raw HTML input handling in AsyncWebCrawler + - Enhanced `crawl4ai/async_webcrawler.py` with significant performance improvements + +4. **Browser Management Enhancements** (Nov 12, 2024) + - Implemented new async crawler strategy using Playwright + - Introduced ManagedBrowser for better browser session handling + - Added support for persistent browser sessions + - Updated from playwright_stealth to tf-playwright-stealth + +5. **API Server Component** + - Added CORS support + - Implemented static file serving + - Enhanced root redirect functionality + # [0.3.74] November 14, 2024 - In this commit, the library is updated to process file downloads. Users can now specify a download folder and trigger the download process via JavaScript or other means, with all files being saved. The list of downloaded files will also be added to the crawl result object. diff --git a/docs/examples/v0.3.74.overview.py b/docs/examples/v0.3.74.overview.py new file mode 100644 index 00000000..579d05dd --- /dev/null +++ b/docs/examples/v0.3.74.overview.py @@ -0,0 +1,195 @@ +import asyncio +import os +from pathlib import Path +import aiohttp +import json +from crawl4ai import AsyncWebCrawler +from crawl4ai.content_filter_strategy import BM25ContentFilter + +# 1. 
File Download Processing Example +async def download_example(): + """Example of downloading files from Python.org""" + # downloads_path = os.path.join(os.getcwd(), "downloads") + downloads_path = os.path.join(Path.home(), ".crawl4ai", "downloads") + os.makedirs(downloads_path, exist_ok=True) + + print(f"Downloads will be saved to: {downloads_path}") + + async with AsyncWebCrawler( + accept_downloads=True, + downloads_path=downloads_path, + verbose=True + ) as crawler: + result = await crawler.arun( + url="https://www.python.org/downloads/", + js_code=""" + // Find and click the first Windows installer link + const downloadLink = document.querySelector('a[href$=".exe"]'); + if (downloadLink) { + console.log('Found download link:', downloadLink.href); + downloadLink.click(); + } else { + console.log('No .exe download link found'); + } + """, + wait_for=5 # Wait 5 seconds to ensure download starts + ) + + if result.downloaded_files: + print("\nDownload successful!") + print("Downloaded files:") + for file_path in result.downloaded_files: + print(f"- {file_path}") + print(f" File size: {os.path.getsize(file_path) / (1024*1024):.2f} MB") + else: + print("\nNo files were downloaded") + +# 2. Content Filtering with BM25 Example +async def content_filtering_example(): + """Example of using the new BM25 content filtering""" + async with AsyncWebCrawler(verbose=True) as crawler: + # Create filter with custom query for OpenAI's blog + content_filter = BM25ContentFilter( + user_query="AI language models research innovation", + bm25_threshold=1.0 + ) + + result = await crawler.arun( + url="https://openai.com/blog", + extraction_strategy=content_filter + ) + + print(f"Filtered content: {result.extracted_content}") + +# 3. Local File and Raw HTML Processing Example +async def local_and_raw_html_example(): + """Example of processing local files and raw HTML""" + # Create a sample HTML file + sample_file = "sample.html" + with open(sample_file, "w") as f: + f.write(""" + +

    Test Content

    +

    This is a test paragraph.

    + + """) + + async with AsyncWebCrawler(verbose=True) as crawler: + # Process local file + local_result = await crawler.arun( + url=f"file://{os.path.abspath(sample_file)}" + ) + + # Process raw HTML + raw_html = """ + +

    Raw HTML Test

    +

    This is a test of raw HTML processing.

    + + """ + raw_result = await crawler.arun( + url=f"raw:{raw_html}" + ) + + # Clean up + os.remove(sample_file) + + print("Local file content:", local_result.markdown) + print("\nRaw HTML content:", raw_result.markdown) + +# 4. Browser Management Example +async def browser_management_example(): + """Example of using enhanced browser management features""" + # Use the specified user directory path + user_data_dir = os.path.join(Path.home(), ".crawl4ai", "browser_profile") + os.makedirs(user_data_dir, exist_ok=True) + + print(f"Browser profile will be saved to: {user_data_dir}") + + async with AsyncWebCrawler( + use_managed_browser=True, + user_data_dir=user_data_dir, + headless=False, + verbose=True + ) as crawler: + # Use GitHub as an example - it's a good test for browser management + # because it requires proper browser handling + result = await crawler.arun( + url="https://github.com/trending", + session_id="persistent_session_1", + js_code=""" + // Custom JavaScript to execute on GitHub's trending page + const repos = document.querySelectorAll('article.Box-row'); + const data = Array.from(repos).map(repo => ({ + name: repo.querySelector('h2')?.textContent?.trim(), + description: repo.querySelector('p')?.textContent?.trim(), + language: repo.querySelector('[itemprop="programmingLanguage"]')?.textContent?.trim() + })); + console.log('Trending repositories:', JSON.stringify(data, null, 2)); + """ + ) + + print("\nBrowser session result:", result.success) + if result.success: + print("Page title:", result.metadata.get('title', 'No title found')) + +# 5. 
API Usage Example +async def api_example(): + """Example of using the new API endpoints""" + async with aiohttp.ClientSession() as session: + # Submit crawl job + crawl_request = { + "urls": ["https://news.ycombinator.com"], # Hacker News as an example + "extraction_config": { + "type": "json_css", + "params": { + "selectors": { + "titles": ".title a", + "scores": ".score", + "comments": ".comment-tree" + } + } + }, + "crawler_params": { + "headless": True, + "use_managed_browser": True + }, + "screenshot": True, + "magic": True + } + + async with session.post( + "http://localhost:11235/crawl", + json=crawl_request + ) as response: + task_data = await response.json() + task_id = task_data["task_id"] + + # Check task status + async with session.get( + f"http://localhost:11235/task/{task_id}" + ) as status_response: + result = await status_response.json() + print(f"Task result: {result}") + +# Main execution +async def main(): + print("Running Crawl4AI feature examples...") + + print("\n1. Running Download Example:") + await download_example() + + print("\n2. Running Content Filtering Example:") + await content_filtering_example() + + print("\n3. Running Local and Raw HTML Example:") + await local_and_raw_html_example() + + print("\n4. Running Browser Management Example:") + await browser_management_example() + + print("\n5. Running API Example:") + await api_example() + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file From d0014c67931a27f3969e257da59aa9b70527b4cf Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sat, 16 Nov 2024 14:54:41 +0800 Subject: [PATCH 023/115] New async database manager and migration support - Introduced AsyncDatabaseManager for async DB management. - Added migration feature to transition to file-based storage. - Enhanced web crawler with improved caching logic. - Updated requirements and setup for async processing. 
--- crawl4ai/async_database.3.73.py | 285 ++++++++++++++++++++++++++++++++ crawl4ai/async_database.py | 185 +++++++++++++++------ crawl4ai/async_webcrawler.py | 116 +++++++------ crawl4ai/config.py | 4 +- crawl4ai/migrations.py | 152 +++++++++++++++++ crawl4ai/utils.py | 27 +++ requirements.txt | 1 + setup.py | 34 +++- 8 files changed, 685 insertions(+), 119 deletions(-) create mode 100644 crawl4ai/async_database.3.73.py create mode 100644 crawl4ai/migrations.py diff --git a/crawl4ai/async_database.3.73.py b/crawl4ai/async_database.3.73.py new file mode 100644 index 00000000..f86c7f1d --- /dev/null +++ b/crawl4ai/async_database.3.73.py @@ -0,0 +1,285 @@ +import os +from pathlib import Path +import aiosqlite +import asyncio +from typing import Optional, Tuple, Dict +from contextlib import asynccontextmanager +import logging +import json # Added for serialization/deserialization +from .utils import ensure_content_dirs, generate_content_hash +import xxhash +import aiofiles +# Set up logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +DB_PATH = os.path.join(Path.home(), ".crawl4ai") +os.makedirs(DB_PATH, exist_ok=True) +DB_PATH = os.path.join(DB_PATH, "crawl4ai.db") + +class AsyncDatabaseManager: + def __init__(self, pool_size: int = 10, max_retries: int = 3): + self.db_path = DB_PATH + self.content_paths = ensure_content_dirs(os.path.dirname(DB_PATH)) + self.pool_size = pool_size + self.max_retries = max_retries + self.connection_pool: Dict[int, aiosqlite.Connection] = {} + self.pool_lock = asyncio.Lock() + self.connection_semaphore = asyncio.Semaphore(pool_size) + + async def initialize(self): + """Initialize the database and connection pool""" + await self.ainit_db() + + async def cleanup(self): + """Cleanup connections when shutting down""" + async with self.pool_lock: + for conn in self.connection_pool.values(): + await conn.close() + self.connection_pool.clear() + + @asynccontextmanager + async def get_connection(self): + 
"""Connection pool manager""" + async with self.connection_semaphore: + task_id = id(asyncio.current_task()) + try: + async with self.pool_lock: + if task_id not in self.connection_pool: + conn = await aiosqlite.connect( + self.db_path, + timeout=30.0 + ) + await conn.execute('PRAGMA journal_mode = WAL') + await conn.execute('PRAGMA busy_timeout = 5000') + self.connection_pool[task_id] = conn + + yield self.connection_pool[task_id] + + except Exception as e: + logger.error(f"Connection error: {e}") + raise + finally: + async with self.pool_lock: + if task_id in self.connection_pool: + await self.connection_pool[task_id].close() + del self.connection_pool[task_id] + + async def execute_with_retry(self, operation, *args): + """Execute database operations with retry logic""" + for attempt in range(self.max_retries): + try: + async with self.get_connection() as db: + result = await operation(db, *args) + await db.commit() + return result + except Exception as e: + if attempt == self.max_retries - 1: + logger.error(f"Operation failed after {self.max_retries} attempts: {e}") + raise + await asyncio.sleep(1 * (attempt + 1)) # Exponential backoff + + async def ainit_db(self): + """Initialize database schema""" + async def _init(db): + await db.execute(''' + CREATE TABLE IF NOT EXISTS crawled_data ( + url TEXT PRIMARY KEY, + html TEXT, + cleaned_html TEXT, + markdown TEXT, + extracted_content TEXT, + success BOOLEAN, + media TEXT DEFAULT "{}", + links TEXT DEFAULT "{}", + metadata TEXT DEFAULT "{}", + screenshot TEXT DEFAULT "", + response_headers TEXT DEFAULT "{}", + downloaded_files TEXT DEFAULT "{}" -- New column added + ) + ''') + + await self.execute_with_retry(_init) + await self.update_db_schema() + + async def update_db_schema(self): + """Update database schema if needed""" + async def _check_columns(db): + cursor = await db.execute("PRAGMA table_info(crawled_data)") + columns = await cursor.fetchall() + return [column[1] for column in columns] + + column_names = 
await self.execute_with_retry(_check_columns) + + # List of new columns to add + new_columns = ['media', 'links', 'metadata', 'screenshot', 'response_headers', 'downloaded_files'] + + for column in new_columns: + if column not in column_names: + await self.aalter_db_add_column(column) + + async def aalter_db_add_column(self, new_column: str): + """Add new column to the database""" + async def _alter(db): + if new_column == 'response_headers': + await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT "{{}}"') + else: + await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""') + logger.info(f"Added column '{new_column}' to the database.") + + await self.execute_with_retry(_alter) + + + async def aget_cached_url(self, url: str) -> Optional[Tuple[str, str, str, str, str, bool, str, str, str, str]]: + """Retrieve cached URL data""" + async def _get(db): + async with db.execute( + ''' + SELECT url, html, cleaned_html, markdown, + extracted_content, success, media, links, + metadata, screenshot, response_headers, + downloaded_files + FROM crawled_data WHERE url = ? 
+ ''', + (url,) + ) as cursor: + row = await cursor.fetchone() + if row: + # Load content from files using stored hashes + html = await self._load_content(row[1], 'html') if row[1] else "" + cleaned = await self._load_content(row[2], 'cleaned') if row[2] else "" + markdown = await self._load_content(row[3], 'markdown') if row[3] else "" + extracted = await self._load_content(row[4], 'extracted') if row[4] else "" + screenshot = await self._load_content(row[9], 'screenshots') if row[9] else "" + + return ( + row[0], # url + html or "", # Return empty string if file not found + cleaned or "", + markdown or "", + extracted or "", + row[5], # success + json.loads(row[6] or '{}'), # media + json.loads(row[7] or '{}'), # links + json.loads(row[8] or '{}'), # metadata + screenshot or "", + json.loads(row[10] or '{}'), # response_headers + json.loads(row[11] or '[]') # downloaded_files + ) + return None + + try: + return await self.execute_with_retry(_get) + except Exception as e: + logger.error(f"Error retrieving cached URL: {e}") + return None + + async def acache_url(self, url: str, html: str, cleaned_html: str, + markdown: str, extracted_content: str, success: bool, + media: str = "{}", links: str = "{}", + metadata: str = "{}", screenshot: str = "", + response_headers: str = "{}", downloaded_files: str = "[]"): + """Cache URL data with content stored in filesystem""" + + # Store content files and get hashes + html_hash = await self._store_content(html, 'html') + cleaned_hash = await self._store_content(cleaned_html, 'cleaned') + markdown_hash = await self._store_content(markdown, 'markdown') + extracted_hash = await self._store_content(extracted_content, 'extracted') + screenshot_hash = await self._store_content(screenshot, 'screenshots') + + async def _cache(db): + await db.execute(''' + INSERT INTO crawled_data ( + url, html, cleaned_html, markdown, + extracted_content, success, media, links, metadata, + screenshot, response_headers, downloaded_files + ) + VALUES 
(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + ON CONFLICT(url) DO UPDATE SET + html = excluded.html, + cleaned_html = excluded.cleaned_html, + markdown = excluded.markdown, + extracted_content = excluded.extracted_content, + success = excluded.success, + media = excluded.media, + links = excluded.links, + metadata = excluded.metadata, + screenshot = excluded.screenshot, + response_headers = excluded.response_headers, + downloaded_files = excluded.downloaded_files + ''', (url, html_hash, cleaned_hash, markdown_hash, extracted_hash, + success, media, links, metadata, screenshot_hash, + response_headers, downloaded_files)) + + try: + await self.execute_with_retry(_cache) + except Exception as e: + logger.error(f"Error caching URL: {e}") + + + + async def aget_total_count(self) -> int: + """Get total number of cached URLs""" + async def _count(db): + async with db.execute('SELECT COUNT(*) FROM crawled_data') as cursor: + result = await cursor.fetchone() + return result[0] if result else 0 + + try: + return await self.execute_with_retry(_count) + except Exception as e: + logger.error(f"Error getting total count: {e}") + return 0 + + async def aclear_db(self): + """Clear all data from the database""" + async def _clear(db): + await db.execute('DELETE FROM crawled_data') + + try: + await self.execute_with_retry(_clear) + except Exception as e: + logger.error(f"Error clearing database: {e}") + + async def aflush_db(self): + """Drop the entire table""" + async def _flush(db): + await db.execute('DROP TABLE IF EXISTS crawled_data') + + try: + await self.execute_with_retry(_flush) + except Exception as e: + logger.error(f"Error flushing database: {e}") + + + async def _store_content(self, content: str, content_type: str) -> str: + """Store content in filesystem and return hash""" + if not content: + return "" + + content_hash = generate_content_hash(content) + file_path = os.path.join(self.content_paths[content_type], content_hash) + + # Only write if file doesn't exist + if not 
os.path.exists(file_path): + async with aiofiles.open(file_path, 'w', encoding='utf-8') as f: + await f.write(content) + + return content_hash + + async def _load_content(self, content_hash: str, content_type: str) -> Optional[str]: + """Load content from filesystem by hash""" + if not content_hash: + return None + + file_path = os.path.join(self.content_paths[content_type], content_hash) + try: + async with aiofiles.open(file_path, 'r', encoding='utf-8') as f: + return await f.read() + except: + logger.error(f"Failed to load content: {file_path}") + return None + +# Create a singleton instance +async_db_manager = AsyncDatabaseManager() diff --git a/crawl4ai/async_database.py b/crawl4ai/async_database.py index c52e3db6..f97d8131 100644 --- a/crawl4ai/async_database.py +++ b/crawl4ai/async_database.py @@ -6,7 +6,11 @@ from typing import Optional, Tuple, Dict from contextlib import asynccontextmanager import logging import json # Added for serialization/deserialization - +from .utils import ensure_content_dirs, generate_content_hash +from .models import CrawlResult +import xxhash +import aiofiles +from .config import NEED_MIGRATION # Set up logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -18,6 +22,7 @@ DB_PATH = os.path.join(DB_PATH, "crawl4ai.db") class AsyncDatabaseManager: def __init__(self, pool_size: int = 10, max_retries: int = 3): self.db_path = DB_PATH + self.content_paths = ensure_content_dirs(os.path.dirname(DB_PATH)) self.pool_size = pool_size self.max_retries = max_retries self.connection_pool: Dict[int, aiosqlite.Connection] = {} @@ -26,8 +31,20 @@ class AsyncDatabaseManager: async def initialize(self): """Initialize the database and connection pool""" - await self.ainit_db() - + try: + logger.info("Initializing database...") + await self.ainit_db() + if NEED_MIGRATION: + await self.update_db_schema() + from .migrations import run_migration # Import here to avoid circular imports + await run_migration() + 
logger.info("Database initialization and migration completed successfully") + else: + logger.info("Database initialization completed successfully") + except Exception as e: + logger.error(f"Database initialization error: {e}") + logger.info("Database will be initialized on first use") + async def cleanup(self): """Cleanup connections when shutting down""" async with self.pool_lock: @@ -97,7 +114,7 @@ class AsyncDatabaseManager: ''') await self.execute_with_retry(_init) - await self.update_db_schema() + async def update_db_schema(self): """Update database schema if needed""" @@ -126,34 +143,59 @@ class AsyncDatabaseManager: await self.execute_with_retry(_alter) - async def aget_cached_url(self, url: str) -> Optional[Tuple[str, str, str, str, str, bool, str, str, str, str]]: - """Retrieve cached URL data""" + async def aget_cached_url(self, url: str) -> Optional[CrawlResult]: + """Retrieve cached URL data as CrawlResult""" async def _get(db): async with db.execute( - ''' - SELECT url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot, response_headers, downloaded_files - FROM crawled_data WHERE url = ? 
- ''', - (url,) + 'SELECT * FROM crawled_data WHERE url = ?', (url,) ) as cursor: row = await cursor.fetchone() - if row: - # Deserialize JSON fields - return ( - row[0], # url - row[1], # html - row[2], # cleaned_html - row[3], # markdown - row[4], # extracted_content - row[5], # success - json.loads(row[6] or '{}'), # media - json.loads(row[7] or '{}'), # links - json.loads(row[8] or '{}'), # metadata - row[9], # screenshot - json.loads(row[10] or '{}'), # response_headers - json.loads(row[11] or '[]') # downloaded_files - ) - return None + if not row: + return None + + # Get column names + columns = [description[0] for description in cursor.description] + # Create dict from row data + row_dict = dict(zip(columns, row)) + + # Load content from files using stored hashes + content_fields = { + 'html': row_dict['html'], + 'cleaned_html': row_dict['cleaned_html'], + 'markdown': row_dict['markdown'], + 'extracted_content': row_dict['extracted_content'], + 'screenshot': row_dict['screenshot'] + } + + for field, hash_value in content_fields.items(): + if hash_value: + content = await self._load_content( + hash_value, + field.split('_')[0] # Get content type from field name + ) + row_dict[field] = content or "" + else: + row_dict[field] = "" + + # Parse JSON fields + json_fields = ['media', 'links', 'metadata', 'response_headers'] + for field in json_fields: + try: + row_dict[field] = json.loads(row_dict[field]) if row_dict[field] else {} + except json.JSONDecodeError: + row_dict[field] = {} + + # Parse downloaded_files + try: + row_dict['downloaded_files'] = json.loads(row_dict['downloaded_files']) if row_dict['downloaded_files'] else [] + except json.JSONDecodeError: + row_dict['downloaded_files'] = [] + + # Remove any fields not in CrawlResult model + valid_fields = CrawlResult.__annotations__.keys() + filtered_dict = {k: v for k, v in row_dict.items() if k in valid_fields} + + return CrawlResult(**filtered_dict) try: return await self.execute_with_retry(_get) @@ 
-161,26 +203,27 @@ class AsyncDatabaseManager: logger.error(f"Error retrieving cached URL: {e}") return None - async def acache_url( - self, - url: str, - html: str, - cleaned_html: str, - markdown: str, - extracted_content: str, - success: bool, - media: str = "{}", - links: str = "{}", - metadata: str = "{}", - screenshot: str = "", - response_headers: str = "{}", - downloaded_files: str = "[]" - ): - """Cache URL data with retry logic""" + async def acache_url(self, result: CrawlResult): + """Cache CrawlResult data""" + # Store content files and get hashes + content_map = { + 'html': (result.html, 'html'), + 'cleaned_html': (result.cleaned_html or "", 'cleaned'), + 'markdown': (result.markdown or "", 'markdown'), + 'extracted_content': (result.extracted_content or "", 'extracted'), + 'screenshot': (result.screenshot or "", 'screenshots') + } + + content_hashes = {} + for field, (content, content_type) in content_map.items(): + content_hashes[field] = await self._store_content(content, content_type) + async def _cache(db): await db.execute(''' INSERT INTO crawled_data ( - url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot, response_headers, downloaded_files + url, html, cleaned_html, markdown, + extracted_content, success, media, links, metadata, + screenshot, response_headers, downloaded_files ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
ON CONFLICT(url) DO UPDATE SET @@ -189,13 +232,26 @@ class AsyncDatabaseManager: markdown = excluded.markdown, extracted_content = excluded.extracted_content, success = excluded.success, - media = excluded.media, - links = excluded.links, - metadata = excluded.metadata, + media = excluded.media, + links = excluded.links, + metadata = excluded.metadata, screenshot = excluded.screenshot, - response_headers = excluded.response_headers, -- Update response_headers + response_headers = excluded.response_headers, downloaded_files = excluded.downloaded_files - ''', (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot, response_headers, downloaded_files)) + ''', ( + result.url, + content_hashes['html'], + content_hashes['cleaned_html'], + content_hashes['markdown'], + content_hashes['extracted_content'], + result.success, + json.dumps(result.media), + json.dumps(result.links), + json.dumps(result.metadata or {}), + content_hashes['screenshot'], + json.dumps(result.response_headers or {}), + json.dumps(result.downloaded_files or []) + )) try: await self.execute_with_retry(_cache) @@ -234,6 +290,35 @@ class AsyncDatabaseManager: await self.execute_with_retry(_flush) except Exception as e: logger.error(f"Error flushing database: {e}") + + + async def _store_content(self, content: str, content_type: str) -> str: + """Store content in filesystem and return hash""" + if not content: + return "" + + content_hash = generate_content_hash(content) + file_path = os.path.join(self.content_paths[content_type], content_hash) + + # Only write if file doesn't exist + if not os.path.exists(file_path): + async with aiofiles.open(file_path, 'w', encoding='utf-8') as f: + await f.write(content) + + return content_hash + + async def _load_content(self, content_hash: str, content_type: str) -> Optional[str]: + """Load content from filesystem by hash""" + if not content_hash: + return None + + file_path = os.path.join(self.content_paths[content_type], 
content_hash) + try: + async with aiofiles.open(file_path, 'r', encoding='utf-8') as f: + return await f.read() + except: + logger.error(f"Failed to load content: {file_path}") + return None # Create a singleton instance async_db_manager = AsyncDatabaseManager() diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index cec1ace0..febc01d3 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -47,17 +47,17 @@ class AsyncWebCrawler: async def awarmup(self): # Print a message for crawl4ai and its version - print(f"[LOG] 🚀 Crawl4AI {crawl4ai_version}") if self.verbose: + print(f"[LOG] 🚀 Crawl4AI {crawl4ai_version}") print("[LOG] 🌤️ Warming up the AsyncWebCrawler") # await async_db_manager.ainit_db() - await async_db_manager.initialize() - await self.arun( - url="https://google.com/", - word_count_threshold=5, - bypass_cache=False, - verbose=False, - ) + # # await async_db_manager.initialize() + # await self.arun( + # url="https://google.com/", + # word_count_threshold=5, + # bypass_cache=False, + # verbose=False, + # ) self.ready = True if self.verbose: print("[LOG] 🌞 AsyncWebCrawler is ready to crawl") @@ -73,6 +73,9 @@ class AsyncWebCrawler: screenshot: bool = False, user_agent: str = None, verbose=True, + disable_cache: bool = False, + no_cache_read: bool = False, + no_cache_write: bool = False, **kwargs, ) -> CrawlResult: """ @@ -89,6 +92,11 @@ class AsyncWebCrawler: CrawlResult: The result of the crawling and processing. 
""" try: + if disable_cache: + bypass_cache = True + no_cache_read = True + no_cache_write = True + extraction_strategy = extraction_strategy or NoExtractionStrategy() extraction_strategy.verbose = verbose if not isinstance(extraction_strategy, ExtractionStrategy): @@ -108,36 +116,39 @@ class AsyncWebCrawler: is_raw_html = url.startswith("raw:") _url = url if not is_raw_html else "Raw HTML" - if is_web_url and not bypass_cache and not self.always_by_pass_cache: - cached = await async_db_manager.aget_cached_url(url) + start_time = time.perf_counter() + cached_result = None + if is_web_url and (not bypass_cache or not no_cache_read) and not self.always_by_pass_cache: + cached_result = await async_db_manager.aget_cached_url(url) - # if not bypass_cache and not self.always_by_pass_cache: - # cached = await async_db_manager.aget_cached_url(url) - - if kwargs.get("warmup", True) and not self.ready: - return None - - if cached: - html = sanitize_input_encode(cached[1]) - extracted_content = sanitize_input_encode(cached[4]) + if cached_result: + html = sanitize_input_encode(cached_result.html) + extracted_content = sanitize_input_encode(cached_result.extracted_content or "") if screenshot: - screenshot_data = cached[9] + screenshot_data = cached_result.screenshot if not screenshot_data: - cached = None + cached_result = None + if verbose: + print( + f"[LOG] 1️⃣ ✅ Page fetched (cache) for {_url}, success: {bool(html)}, time taken: {time.perf_counter() - start_time:.2f} seconds" + ) + if not cached or not html: - t1 = time.time() + t1 = time.perf_counter() + if user_agent: self.crawler_strategy.update_user_agent(user_agent) async_response: AsyncCrawlResponse = await self.crawler_strategy.crawl(url, screenshot=screenshot, **kwargs) html = sanitize_input_encode(async_response.html) screenshot_data = async_response.screenshot - t2 = time.time() + t2 = time.perf_counter() if verbose: print( - f"[LOG] 🚀 Crawling done for {_url}, success: {bool(html)}, time taken: {t2 - t1:.2f} 
seconds" + f"[LOG] 1️⃣ ✅ Page fetched (no-cache) for {_url}, success: {bool(html)}, time taken: {t2 - t1:.2f} seconds" ) + t1 = time.perf_counter() crawl_result = await self.aprocess_html( url=url, html=html, @@ -163,30 +174,19 @@ class AsyncWebCrawler: crawl_result.downloaded_files = async_response.downloaded_files else: crawl_result.status_code = 200 - crawl_result.response_headers = cached[10] - # crawl_result.downloaded_files = cached[11] + crawl_result.response_headers = cached_result.response_headers if cached_result else {} crawl_result.success = bool(html) crawl_result.session_id = kwargs.get("session_id", None) + if verbose: + print( + f"[LOG] 🔥 🚀 Crawling done for {_url}, success: {crawl_result.success}, time taken: {time.perf_counter() - start_time:.2f} seconds" + ) - if not is_raw_html: - if not bool(cached) or kwargs.get("bypass_cache", False) or self.always_by_pass_cache: - await async_db_manager.acache_url( - url = url, - html = html, - cleaned_html = crawl_result.cleaned_html, - markdown = crawl_result.markdown, - extracted_content = extracted_content, - success = True, - media = json.dumps(crawl_result.media), - links = json.dumps(crawl_result.links), - metadata = json.dumps(crawl_result.metadata), - screenshot=screenshot, - response_headers=json.dumps(crawl_result.response_headers), - downloaded_files=json.dumps(crawl_result.downloaded_files), - - ) + if not is_raw_html and not no_cache_write: + if not bool(cached_result) or kwargs.get("bypass_cache", False) or self.always_by_pass_cache: + await async_db_manager.acache_url(crawl_result) return crawl_result @@ -258,11 +258,11 @@ class AsyncWebCrawler: verbose: bool, **kwargs, ) -> CrawlResult: - t = time.time() + t = time.perf_counter() # Extract content from HTML try: _url = url if not kwargs.get("is_raw_html", False) else "Raw HTML" - t1 = time.time() + t1 = time.perf_counter() scrapping_strategy = WebScrapingStrategy() # result = await scrapping_strategy.ascrap( result = 
scrapping_strategy.scrap( @@ -276,10 +276,6 @@ class AsyncWebCrawler: ), **kwargs, ) - if verbose: - print( - f"[LOG] 🚀 Content extracted for {_url}, success: True, time taken: {time.time() - t1:.2f} seconds" - ) if result is None: raise ValueError(f"Process HTML, Failed to extract content from the website: {url}") @@ -295,13 +291,14 @@ class AsyncWebCrawler: media = result.get("media", []) links = result.get("links", []) metadata = result.get("metadata", {}) + + if verbose: + print( + f"[LOG] 2️⃣ ✅ Scraping done for {_url}, success: True, time taken: {time.perf_counter() - t1:.2f} seconds" + ) - if extracted_content is None and extraction_strategy and chunking_strategy: - if verbose: - print( - f"[LOG] 🔥 Extracting semantic blocks for {_url}, Strategy: {self.__class__.__name__}" - ) - + if extracted_content is None and extraction_strategy and chunking_strategy and not isinstance(extraction_strategy, NoExtractionStrategy): + t1 = time.perf_counter() # Check if extraction strategy is type of JsonCssExtractionStrategy if isinstance(extraction_strategy, JsonCssExtractionStrategy) or isinstance(extraction_strategy, JsonCssExtractionStrategy): extraction_strategy.verbose = verbose @@ -311,11 +308,10 @@ class AsyncWebCrawler: sections = chunking_strategy.chunk(markdown) extracted_content = extraction_strategy.run(url, sections) extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False) - - if verbose: - print( - f"[LOG] 🚀 Extraction done for {_url}, time taken: {time.time() - t:.2f} seconds." 
- ) + if verbose: + print( + f"[LOG] 3️⃣ ✅ Extraction done for {_url}, time taken: {time.perf_counter() - t1:.2f} seconds" + ) screenshot = None if not screenshot else screenshot diff --git a/crawl4ai/config.py b/crawl4ai/config.py index 16638b6d..5bc284bf 100644 --- a/crawl4ai/config.py +++ b/crawl4ai/config.py @@ -52,4 +52,6 @@ SOCIAL_MEDIA_DOMAINS = [ # If image is in the first half of the total images extracted from the page IMAGE_SCORE_THRESHOLD = 2 -MAX_METRICS_HISTORY = 1000 \ No newline at end of file +MAX_METRICS_HISTORY = 1000 + +NEED_MIGRATION = True \ No newline at end of file diff --git a/crawl4ai/migrations.py b/crawl4ai/migrations.py new file mode 100644 index 00000000..77616086 --- /dev/null +++ b/crawl4ai/migrations.py @@ -0,0 +1,152 @@ +import os +import asyncio +import logging +from pathlib import Path +import aiosqlite +from typing import Optional +import xxhash +import aiofiles +import shutil +import time +from datetime import datetime + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +class DatabaseMigration: + def __init__(self, db_path: str): + self.db_path = db_path + self.content_paths = self._ensure_content_dirs(os.path.dirname(db_path)) + + def _ensure_content_dirs(self, base_path: str) -> dict: + dirs = { + 'html': 'html_content', + 'cleaned': 'cleaned_html', + 'markdown': 'markdown_content', + 'extracted': 'extracted_content', + 'screenshots': 'screenshots' + } + content_paths = {} + for key, dirname in dirs.items(): + path = os.path.join(base_path, dirname) + os.makedirs(path, exist_ok=True) + content_paths[key] = path + return content_paths + + def _generate_content_hash(self, content: str) -> str: + x = xxhash.xxh64() + x.update(content.encode()) + content_hash = x.hexdigest() + return content_hash + # return hashlib.sha256(content.encode()).hexdigest() + + async def _store_content(self, content: str, content_type: str) -> str: + if not content: + return "" + + content_hash = 
self._generate_content_hash(content) + file_path = os.path.join(self.content_paths[content_type], content_hash) + + if not os.path.exists(file_path): + async with aiofiles.open(file_path, 'w', encoding='utf-8') as f: + await f.write(content) + + return content_hash + + async def migrate_database(self): + """Migrate existing database to file-based storage""" + logger.info("Starting database migration...") + + try: + async with aiosqlite.connect(self.db_path) as db: + # Get all rows + async with db.execute( + '''SELECT url, html, cleaned_html, markdown, + extracted_content, screenshot FROM crawled_data''' + ) as cursor: + rows = await cursor.fetchall() + + migrated_count = 0 + for row in rows: + url, html, cleaned_html, markdown, extracted_content, screenshot = row + + # Store content in files and get hashes + html_hash = await self._store_content(html, 'html') + cleaned_hash = await self._store_content(cleaned_html, 'cleaned') + markdown_hash = await self._store_content(markdown, 'markdown') + extracted_hash = await self._store_content(extracted_content, 'extracted') + screenshot_hash = await self._store_content(screenshot, 'screenshots') + + # Update database with hashes + await db.execute(''' + UPDATE crawled_data + SET html = ?, + cleaned_html = ?, + markdown = ?, + extracted_content = ?, + screenshot = ? + WHERE url = ? + ''', (html_hash, cleaned_hash, markdown_hash, + extracted_hash, screenshot_hash, url)) + + migrated_count += 1 + if migrated_count % 100 == 0: + logger.info(f"Migrated {migrated_count} records...") + + await db.commit() + logger.info(f"Migration completed. {migrated_count} records processed.") + + except Exception as e: + logger.error(f"Migration failed: {e}") + raise + +async def backup_database(db_path: str) -> str: + """Create backup of existing database""" + if not os.path.exists(db_path): + logger.info("No existing database found. 
Skipping backup.") + return None + + # Create backup with timestamp + timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') + backup_path = f"{db_path}.backup_{timestamp}" + + try: + # Wait for any potential write operations to finish + await asyncio.sleep(1) + + # Create backup + shutil.copy2(db_path, backup_path) + logger.info(f"Database backup created at: {backup_path}") + return backup_path + except Exception as e: + logger.error(f"Backup failed: {e}") + raise + +async def run_migration(db_path: Optional[str] = None): + """Run database migration""" + if db_path is None: + db_path = os.path.join(Path.home(), ".crawl4ai", "crawl4ai.db") + + if not os.path.exists(db_path): + logger.info("No existing database found. Skipping migration.") + return + + # Create backup first + backup_path = await backup_database(db_path) + if not backup_path: + return + + migration = DatabaseMigration(db_path) + await migration.migrate_database() + +def main(): + """CLI entry point for migration""" + import argparse + parser = argparse.ArgumentParser(description='Migrate Crawl4AI database to file-based storage') + parser.add_argument('--db-path', help='Custom database path') + args = parser.parse_args() + + asyncio.run(run_migration(args.db_path)) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index 49483f43..a80cf09a 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -14,6 +14,9 @@ from typing import Dict, Any from urllib.parse import urljoin import requests from requests.exceptions import InvalidSchema +import hashlib +from typing import Optional, Tuple, Dict, Any +import xxhash class InvalidCSSSelectorError(Exception): pass @@ -1109,3 +1112,27 @@ def clean_tokens(tokens: list[str]) -> list[str]: and not token.startswith('↑') and not token.startswith('▲') and not token.startswith('⬆')] + + +def generate_content_hash(content: str) -> str: + """Generate a unique hash for content""" + return 
xxhash.xxh64(content.encode()).hexdigest() + # return hashlib.sha256(content.encode()).hexdigest() + +def ensure_content_dirs(base_path: str) -> Dict[str, str]: + """Create content directories if they don't exist""" + dirs = { + 'html': 'html_content', + 'cleaned': 'cleaned_html', + 'markdown': 'markdown_content', + 'extracted': 'extracted_content', + 'screenshots': 'screenshots' + } + + content_paths = {} + for key, dirname in dirs.items(): + path = os.path.join(base_path, dirname) + os.makedirs(path, exist_ok=True) + content_paths[key] = path + + return content_paths \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index e83643b3..94f741ca 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,3 +9,4 @@ python-dotenv~=1.0 requests~=2.26 beautifulsoup4~=4.12 tf-playwright-stealth~=1.0 +xxhash~=3.4 diff --git a/setup.py b/setup.py index 93190291..d3145ac1 100644 --- a/setup.py +++ b/setup.py @@ -5,34 +5,37 @@ from pathlib import Path import shutil import subprocess import sys +import asyncio -# Create the .crawl4ai folder in the user's home directory if it doesn't exist -# If the folder already exists, remove the cache folder +# Create the .crawl4ai folder structure crawl4ai_folder = Path.home() / ".crawl4ai" cache_folder = crawl4ai_folder / "cache" +content_folders = ['html_content', 'cleaned_html', 'markdown_content', + 'extracted_content', 'screenshots'] +# Clean up old cache if exists if cache_folder.exists(): shutil.rmtree(cache_folder) +# Create new folder structure crawl4ai_folder.mkdir(exist_ok=True) cache_folder.mkdir(exist_ok=True) +for folder in content_folders: + (crawl4ai_folder / folder).mkdir(exist_ok=True) -# Read the requirements from requirements.txt +# Read requirements and version __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) with open(os.path.join(__location__, "requirements.txt")) as f: requirements = f.read().splitlines() -# Read version from __init__.py with 
open("crawl4ai/_version.py") as f: for line in f: if line.startswith("__version__"): version = line.split("=")[1].strip().strip('"') break -# Define the requirements for different environments +# Define requirements default_requirements = requirements -# torch_requirements = ["torch", "nltk", "spacy", "scikit-learn"] -# transformer_requirements = ["transformers", "tokenizers", "onnxruntime"] torch_requirements = ["torch", "nltk", "scikit-learn"] transformer_requirements = ["transformers", "tokenizers"] cosine_similarity_requirements = ["torch", "transformers", "nltk" ] @@ -50,10 +53,24 @@ def install_playwright(): print(f"Unexpected error during Playwright installation: {e}") print("Please run 'python -m playwright install' manually after the installation.") +def run_migration(): + """Initialize database during installation""" + try: + print("Starting database initialization...") + from crawl4ai.async_database import async_db_manager + asyncio.run(async_db_manager.initialize()) + print("Database initialization completed successfully.") + except ImportError: + print("Warning: Database module not found. 
Will initialize on first use.") + except Exception as e: + print(f"Warning: Database initialization failed: {e}") + print("Database will be initialized on first use") + class PostInstallCommand(install): def run(self): install.run(self) install_playwright() + run_migration() setup( name="Crawl4AI", @@ -66,7 +83,7 @@ setup( author_email="unclecode@kidocode.com", license="MIT", packages=find_packages(), - install_requires=default_requirements + ["playwright"], # Add playwright to default requirements + install_requires=default_requirements + ["playwright", "aiofiles"], # Added aiofiles extras_require={ "torch": torch_requirements, "transformer": transformer_requirements, @@ -77,6 +94,7 @@ setup( entry_points={ 'console_scripts': [ 'crawl4ai-download-models=crawl4ai.model_loader:main', + 'crawl4ai-migrate=crawl4ai.migrations:main', # Added migration command ], }, classifiers=[ From 509844208617673ee4cd066a4386a6c76fdadf91 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sat, 16 Nov 2024 15:30:24 +0800 Subject: [PATCH 024/115] refactor: migrate versioning to __version__.py and remove deprecated _version.py --- crawl4ai/__init__.py | 2 +- crawl4ai/{_version.py => __version__.py} | 0 crawl4ai/async_webcrawler.py | 2 +- crawl4ai/content_filter_strategy.py | 5 ----- middlewares.py | 0 requirements-dev.txt | 5 ----- requirements.txt | 2 ++ setup.py | 2 +- 8 files changed, 5 insertions(+), 13 deletions(-) rename crawl4ai/{_version.py => __version__.py} (100%) delete mode 100644 middlewares.py delete mode 100644 requirements-dev.txt diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py index 1bcc491c..e55aaf73 100644 --- a/crawl4ai/__init__.py +++ b/crawl4ai/__init__.py @@ -2,7 +2,7 @@ from .async_webcrawler import AsyncWebCrawler from .models import CrawlResult -from ._version import __version__ +from .__version__ import __version__ # __version__ = "0.3.73" __all__ = [ diff --git a/crawl4ai/_version.py b/crawl4ai/__version__.py similarity index 100% rename from 
crawl4ai/_version.py rename to crawl4ai/__version__.py diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index febc01d3..03e7a393 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -16,7 +16,7 @@ from .utils import ( InvalidCSSSelectorError, format_html ) -from ._version import __version__ as crawl4ai_version +from .__version__ import __version__ as crawl4ai_version class AsyncWebCrawler: def __init__( diff --git a/crawl4ai/content_filter_strategy.py b/crawl4ai/content_filter_strategy.py index 850ebf11..88375da9 100644 --- a/crawl4ai/content_filter_strategy.py +++ b/crawl4ai/content_filter_strategy.py @@ -1,12 +1,7 @@ -import os import re -import time from bs4 import BeautifulSoup, Tag from typing import List, Tuple, Dict from rank_bm25 import BM25Okapi -import nltk -from time import perf_counter -from html5lib import parse, treebuilders from time import perf_counter from collections import deque from bs4 import BeautifulSoup, NavigableString, Tag diff --git a/middlewares.py b/middlewares.py deleted file mode 100644 index e69de29b..00000000 diff --git a/requirements-dev.txt b/requirements-dev.txt deleted file mode 100644 index 7bc121a4..00000000 --- a/requirements-dev.txt +++ /dev/null @@ -1,5 +0,0 @@ --r requirements.txt -pytest -pytest-asyncio -selenium -setuptools diff --git a/requirements.txt b/requirements.txt index 94f741ca..74e8b3d6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,3 +10,5 @@ requests~=2.26 beautifulsoup4~=4.12 tf-playwright-stealth~=1.0 xxhash~=3.4 +rank-bm25~=0.2 +aiofiles~=24.0 \ No newline at end of file diff --git a/setup.py b/setup.py index d3145ac1..d8ad2cd3 100644 --- a/setup.py +++ b/setup.py @@ -28,7 +28,7 @@ __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file with open(os.path.join(__location__, "requirements.txt")) as f: requirements = f.read().splitlines() -with open("crawl4ai/_version.py") as f: +with open("crawl4ai/__version__.py") as f: 
for line in f: if line.startswith("__version__"): version = line.split("=")[1].strip().strip('"') From 90df6921b7be573d95795907fcdebd28002dfd9b Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sat, 16 Nov 2024 15:34:30 +0800 Subject: [PATCH 025/115] feat(crawl_sync): add synchronous crawl endpoint and corresponding test --- docs/examples/docker_example.py | 21 +++++++++++++++++++++ main.py | 24 ++++++++++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/docs/examples/docker_example.py b/docs/examples/docker_example.py index c22acd55..502f1e52 100644 --- a/docs/examples/docker_example.py +++ b/docs/examples/docker_example.py @@ -33,6 +33,13 @@ class Crawl4AiTester: return status time.sleep(2) + + def submit_sync(self, request_data: Dict[str, Any]) -> Dict[str, Any]: + response = requests.post(f"{self.base_url}/crawl_sync", json=request_data, timeout=60) + if response.status_code == 408: + raise TimeoutError("Task did not complete within server timeout") + response.raise_for_status() + return response.json() def test_docker_deployment(version="basic"): tester = Crawl4AiTester() @@ -54,6 +61,7 @@ def test_docker_deployment(version="basic"): # Test cases based on version test_basic_crawl(tester) + test_basic_crawl_sync(tester) # if version in ["full", "transformer"]: # test_cosine_extraction(tester) @@ -78,6 +86,19 @@ def test_basic_crawl(tester: Crawl4AiTester): assert result["result"]["success"] assert len(result["result"]["markdown"]) > 0 +def test_basic_crawl_sync(tester: Crawl4AiTester): + print("\n=== Testing Basic Crawl (Sync) ===") + request = { + "urls": "https://www.nbcnews.com/business", + "priority": 10 + } + + result = tester.submit_sync(request) + print(f"Basic crawl result length: {len(result['result']['markdown'])}") + assert result['status'] == 'completed' + assert result['result']['success'] + assert len(result['result']['markdown']) > 0 + def test_js_execution(tester: Crawl4AiTester): print("\n=== Testing JS Execution ===") request = { diff 
--git a/main.py b/main.py index a5da029c..660c3366 100644 --- a/main.py +++ b/main.py @@ -375,6 +375,30 @@ async def get_task_status(task_id: str): return response +@app.post("/crawl_sync") +async def crawl_sync(request: CrawlRequest) -> Dict[str, Any]: + task_id = await crawler_service.submit_task(request) + + # Wait up to 60 seconds for task completion + for _ in range(60): + task_info = crawler_service.task_manager.get_task(task_id) + if not task_info: + raise HTTPException(status_code=404, detail="Task not found") + + if task_info.status == TaskStatus.COMPLETED: + # Return same format as /task/{task_id} endpoint + if isinstance(task_info.result, list): + return {"status": task_info.status, "results": [result.dict() for result in task_info.result]} + return {"status": task_info.status, "result": task_info.result.dict()} + + if task_info.status == TaskStatus.FAILED: + raise HTTPException(status_code=500, detail=task_info.error) + + await asyncio.sleep(1) + + # If we get here, task didn't complete within timeout + raise HTTPException(status_code=408, detail="Task timed out") + @app.get("/health") async def health_check(): available_slots = await crawler_service.resource_monitor.get_available_slots() From e62c80729559457c937b9740cb3bab960e6103d3 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sat, 16 Nov 2024 16:38:13 +0800 Subject: [PATCH 026/115] feat(deploy): add Railway deployment configuration and setup instructions --- deploy/railway/README.md | 19 +++++++++++++++++++ deploy/railway/button.json | 33 +++++++++++++++++++++++++++++++++ deploy/railway/railway.toml | 18 ++++++++++++++++++ 3 files changed, 70 insertions(+) create mode 100644 deploy/railway/README.md create mode 100644 deploy/railway/button.json create mode 100644 deploy/railway/railway.toml diff --git a/deploy/railway/README.md b/deploy/railway/README.md new file mode 100644 index 00000000..155e7642 --- /dev/null +++ b/deploy/railway/README.md @@ -0,0 +1,19 @@ +# Railway Deployment + +## Quick 
Deploy +[![Deploy on Railway](https://railway.app/button.svg)](https://railway.app/template/crawl4ai) + +## Manual Setup +1. Fork this repository +2. Create a new Railway project +3. Configure environment variables: + - `INSTALL_TYPE`: basic or all + - `ENABLE_GPU`: true/false +4. Deploy! + +## Configuration +See `railway.toml` for: +- Memory limits +- Health checks +- Restart policies +- Scaling options \ No newline at end of file diff --git a/deploy/railway/button.json b/deploy/railway/button.json new file mode 100644 index 00000000..1fc52167 --- /dev/null +++ b/deploy/railway/button.json @@ -0,0 +1,33 @@ +{ + "name": "Crawl4AI", + "description": "LLM Friendly Web Crawler & Scraper", + "render": { + "dockerfile": { + "path": "Dockerfile" + } + }, + "env": [ + { + "key": "INSTALL_TYPE", + "description": "Installation type (basic/all)", + "default": "basic", + "required": true + }, + { + "key": "ENABLE_GPU", + "description": "Enable GPU support", + "default": "false", + "required": false + } + ], + "services": [ + { + "name": "web", + "dockerfile": "./Dockerfile", + "healthcheck": { + "path": "/health", + "port": 11235 + } + } + ] + } \ No newline at end of file diff --git a/deploy/railway/railway.toml b/deploy/railway/railway.toml new file mode 100644 index 00000000..f24d8fab --- /dev/null +++ b/deploy/railway/railway.toml @@ -0,0 +1,18 @@ +# railway.toml +[build] +builder = "DOCKERFILE" +dockerfilePath = "Dockerfile" + +[deploy] +startCommand = "uvicorn main:app --host 0.0.0.0 --port $PORT" +healthcheckPath = "/health" +restartPolicyType = "ON_FAILURE" +restartPolicyMaxRetries = 3 + +[deploy.memory] +soft = 2048 # 2GB min for Playwright +hard = 4096 # 4GB max + +[deploy.scaling] +min = 1 +max = 1 From f77f06a3bd4c1ef6e45b69a64959b55164bf4512 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sat, 16 Nov 2024 16:43:31 +0800 Subject: [PATCH 027/115] feat(deploy): add deployment configuration and templates for crawl4ai --- .do/app.yaml | 19 +++++++++++++++++++ 
.do/deploy.template.yaml | 22 ++++++++++++++++++++++ 2 files changed, 41 insertions(+) create mode 100644 .do/app.yaml create mode 100644 .do/deploy.template.yaml diff --git a/.do/app.yaml b/.do/app.yaml new file mode 100644 index 00000000..7e11aab7 --- /dev/null +++ b/.do/app.yaml @@ -0,0 +1,19 @@ +alerts: +- rule: DEPLOYMENT_FAILED +- rule: DOMAIN_FAILED +name: crawl4ai +region: nyc +services: +- dockerfile_path: Dockerfile + github: + branch: main + deploy_on_push: true + repo: unclecode/crawl4ai + health_check: + http_path: /health + http_port: 11235 + instance_count: 1 + instance_size_slug: basic-xs + name: web + routes: + - path: / \ No newline at end of file diff --git a/.do/deploy.template.yaml b/.do/deploy.template.yaml new file mode 100644 index 00000000..ab76795d --- /dev/null +++ b/.do/deploy.template.yaml @@ -0,0 +1,22 @@ +spec: + name: crawl4ai + services: + - name: crawl4ai + git: + branch: main + repo_clone_url: https://github.com/unclecode/crawl4ai.git + dockerfile_path: Dockerfile + http_port: 11235 + instance_count: 1 + instance_size_slug: basic-xs + health_check: + http_path: /health + envs: + - key: INSTALL_TYPE + value: "basic" + - key: PYTHON_VERSION + value: "3.10" + - key: ENABLE_GPU + value: "false" + routes: + - path: / \ No newline at end of file From fca1319b7d1c3e3da5b07898d3890bced4a7719e Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sat, 16 Nov 2024 17:10:30 +0800 Subject: [PATCH 028/115] feat(docker): add MkDocs installation and build step for documentation --- Dockerfile | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index 125fb9b8..54ac641c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -107,13 +107,19 @@ RUN if [ "$INSTALL_TYPE" = "all" ] ; then \ pip install -e "." 
; \ fi + # Install MkDocs and required plugins +RUN pip install --no-cache-dir \ + mkdocs \ + mkdocs-material \ + mkdocs-terminal \ + pymdown-extensions + +# Build MkDocs documentation +RUN mkdocs build + # Install Playwright and browsers RUN playwright install -# Health check -HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \ - CMD curl -f http://localhost:8000/health || exit 1 - # Expose port EXPOSE 8000 11235 9222 8080 From 6f2fe5954f6ce9f7f17fb15802054cd6c5802123 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sat, 16 Nov 2024 17:12:41 +0800 Subject: [PATCH 029/115] feat(deploy): update instance size to professional-xs and add memory utilization alert --- .do/deploy.template.yaml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.do/deploy.template.yaml b/.do/deploy.template.yaml index ab76795d..c7db5e7e 100644 --- a/.do/deploy.template.yaml +++ b/.do/deploy.template.yaml @@ -8,7 +8,7 @@ spec: dockerfile_path: Dockerfile http_port: 11235 instance_count: 1 - instance_size_slug: basic-xs + instance_size_slug: professional-xs # 4GB RAM, 2 vCPUs health_check: http_path: /health envs: @@ -19,4 +19,7 @@ spec: - key: ENABLE_GPU value: "false" routes: - - path: / \ No newline at end of file + - path: / + alerts: + - rule: MEM_UTILIZATION + value: 90 # Alert at 90% memory usage \ No newline at end of file From 6b569cceb5332ea481190a86086fbf934c7c89e7 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sat, 16 Nov 2024 17:21:45 +0800 Subject: [PATCH 030/115] feat(deploy): update branch to 0.3.74 and change instance size to basic-xs --- .do/app.yaml | 2 +- .do/deploy.template.yaml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.do/app.yaml b/.do/app.yaml index 7e11aab7..bff8ff97 100644 --- a/.do/app.yaml +++ b/.do/app.yaml @@ -6,7 +6,7 @@ region: nyc services: - dockerfile_path: Dockerfile github: - branch: main + branch: 0.3.74 deploy_on_push: true repo: unclecode/crawl4ai health_check: diff --git 
a/.do/deploy.template.yaml b/.do/deploy.template.yaml index c7db5e7e..73ee7c5c 100644 --- a/.do/deploy.template.yaml +++ b/.do/deploy.template.yaml @@ -3,12 +3,12 @@ spec: services: - name: crawl4ai git: - branch: main + branch: 0.3.74 repo_clone_url: https://github.com/unclecode/crawl4ai.git dockerfile_path: Dockerfile http_port: 11235 instance_count: 1 - instance_size_slug: professional-xs # 4GB RAM, 2 vCPUs + instance_size_slug: basic-xs # 4GB RAM, 2 vCPUs health_check: http_path: /health envs: From 67edc2d641a672e9fa5a95fa4341407b9e574851 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sat, 16 Nov 2024 17:23:32 +0800 Subject: [PATCH 031/115] feat(deploy): update instance size to professional-xs and add memory utilization alert parameters --- .do/deploy.template.yaml | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/.do/deploy.template.yaml b/.do/deploy.template.yaml index 73ee7c5c..49d0012b 100644 --- a/.do/deploy.template.yaml +++ b/.do/deploy.template.yaml @@ -8,7 +8,7 @@ spec: dockerfile_path: Dockerfile http_port: 11235 instance_count: 1 - instance_size_slug: basic-xs # 4GB RAM, 2 vCPUs + instance_size_slug: professional-xs health_check: http_path: /health envs: @@ -22,4 +22,9 @@ spec: - path: / alerts: - rule: MEM_UTILIZATION - value: 90 # Alert at 90% memory usage \ No newline at end of file + value: 90 + window: 5m # Added window parameter + operator: GREATER_THAN + disabled: false + - rule: DEPLOYMENT_FAILED + - rule: DOMAIN_FAILED \ No newline at end of file From 5d0b13294cfec45c55b07a5593726335c79b6cde Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sat, 16 Nov 2024 17:25:07 +0800 Subject: [PATCH 032/115] feat(deploy): change instance size to professional-xs and update memory utilization alert window to 300 seconds --- .do/app.yaml | 2 +- .do/deploy.template.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.do/app.yaml b/.do/app.yaml index bff8ff97..00d7b781 100644 --- a/.do/app.yaml +++ b/.do/app.yaml 
@@ -13,7 +13,7 @@ services: http_path: /health http_port: 11235 instance_count: 1 - instance_size_slug: basic-xs + instance_size_slug: professional-xs name: web routes: - path: / \ No newline at end of file diff --git a/.do/deploy.template.yaml b/.do/deploy.template.yaml index 49d0012b..dcd9b2d7 100644 --- a/.do/deploy.template.yaml +++ b/.do/deploy.template.yaml @@ -23,7 +23,7 @@ spec: alerts: - rule: MEM_UTILIZATION value: 90 - window: 5m # Added window parameter + window: 300 # Changed from "5m" to 300 (5 minutes in seconds) operator: GREATER_THAN disabled: false - rule: DEPLOYMENT_FAILED From 79feab89c4236e7de180ec4cd2257df3f5f3e386 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sat, 16 Nov 2024 17:28:42 +0800 Subject: [PATCH 033/115] refactor(deploy): remove memory utilization alert configuration from deployment template --- .do/deploy.template.yaml | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/.do/deploy.template.yaml b/.do/deploy.template.yaml index dcd9b2d7..9a06a366 100644 --- a/.do/deploy.template.yaml +++ b/.do/deploy.template.yaml @@ -19,12 +19,4 @@ spec: - key: ENABLE_GPU value: "false" routes: - - path: / - alerts: - - rule: MEM_UTILIZATION - value: 90 - window: 300 # Changed from "5m" to 300 (5 minutes in seconds) - operator: GREATER_THAN - disabled: false - - rule: DEPLOYMENT_FAILED - - rule: DOMAIN_FAILED \ No newline at end of file + - path: / \ No newline at end of file From 1961adb530baf74dfec16a0f2da795946855459a Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sat, 16 Nov 2024 17:35:27 +0800 Subject: [PATCH 034/115] refactor(docker): remove shared memory size configuration to streamline Dockerfile --- Dockerfile | 5 ----- 1 file changed, 5 deletions(-) diff --git a/Dockerfile b/Dockerfile index 54ac641c..ba29faf1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -123,10 +123,5 @@ RUN playwright install # Expose port EXPOSE 8000 11235 9222 8080 -# Optional: Increase shared memory size to prevent browser crashes -# when 
loading heavy pages -RUN mkdir /dev/shm -VOLUME /dev/shm - # Start the FastAPI server CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "11235"] \ No newline at end of file From 6360d0545ac2812687a1a9a31de95fa64f600ed4 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sat, 16 Nov 2024 18:08:56 +0800 Subject: [PATCH 035/115] feat(api): add API token authentication and update Dockerfile description --- Dockerfile | 3 ++- docker-compose.yml | 33 +++++++++++++++++++++++++++++++++ docs/examples/docker_example.py | 15 ++++++++++----- main.py | 23 ++++++++++++++++++++--- 4 files changed, 65 insertions(+), 9 deletions(-) create mode 100644 docker-compose.yml diff --git a/Dockerfile b/Dockerfile index ba29faf1..76b4e1cf 100644 --- a/Dockerfile +++ b/Dockerfile @@ -12,7 +12,7 @@ ARG ENABLE_GPU=false # Platform-specific labels LABEL maintainer="unclecode" -LABEL description="Crawl4AI - Advanced Web Crawler with AI capabilities" +LABEL description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper" LABEL version="1.0" # Environment setup @@ -79,6 +79,7 @@ COPY . . RUN pip install --no-cache-dir -r requirements.txt # Install required library for FastAPI +RUN pip install . RUN pip install fastapi uvicorn psutil # Install ML dependencies first for better layer caching diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 00000000..ef0dc9e4 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,33 @@ +version: '3.8' + +services: + crawl4ai: + build: + context: . 
+ dockerfile: Dockerfile + args: + PYTHON_VERSION: 3.10 + INSTALL_TYPE: all + ENABLE_GPU: false + ports: + - "11235:11235" # FastAPI server + - "8000:8000" # Alternative port + - "9222:9222" # Browser debugging + - "8080:8080" # Additional port + environment: + - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} # Optional API token + volumes: + - /dev/shm:/dev/shm # Shared memory for browser operations + deploy: + resources: + limits: + memory: 4G + reservations: + memory: 1G + restart: unless-stopped + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:11235/health"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 40s \ No newline at end of file diff --git a/docs/examples/docker_example.py b/docs/examples/docker_example.py index 502f1e52..6701f6ac 100644 --- a/docs/examples/docker_example.py +++ b/docs/examples/docker_example.py @@ -7,12 +7,14 @@ import os from typing import Dict, Any class Crawl4AiTester: - def __init__(self, base_url: str = "http://localhost:11235"): + def __init__(self, base_url: str = "http://localhost:11235", api_token: str = None): self.base_url = base_url + self.api_token = api_token or os.getenv('CRAWL4AI_API_TOKEN') # Check environment variable as fallback + self.headers = {'Authorization': f'Bearer {self.api_token}'} if self.api_token else {} def submit_and_wait(self, request_data: Dict[str, Any], timeout: int = 300) -> Dict[str, Any]: # Submit crawl job - response = requests.post(f"{self.base_url}/crawl", json=request_data) + response = requests.post(f"{self.base_url}/crawl", json=request_data, headers=self.headers) task_id = response.json()["task_id"] print(f"Task ID: {task_id}") @@ -22,7 +24,7 @@ class Crawl4AiTester: if time.time() - start_time > timeout: raise TimeoutError(f"Task {task_id} did not complete within {timeout} seconds") - result = requests.get(f"{self.base_url}/task/{task_id}") + result = requests.get(f"{self.base_url}/task/{task_id}", headers=self.headers) status = result.json() if status["status"] == 
"failed": @@ -35,14 +37,17 @@ class Crawl4AiTester: time.sleep(2) def submit_sync(self, request_data: Dict[str, Any]) -> Dict[str, Any]: - response = requests.post(f"{self.base_url}/crawl_sync", json=request_data, timeout=60) + response = requests.post(f"{self.base_url}/crawl_sync", json=request_data, headers=self.headers, timeout=60) if response.status_code == 408: raise TimeoutError("Task did not complete within server timeout") response.raise_for_status() return response.json() def test_docker_deployment(version="basic"): - tester = Crawl4AiTester() + tester = Crawl4AiTester( + # base_url="http://localhost:11235" + base_url="https://crawl4ai-sby74.ondigitalocean.app" + ) print(f"Testing Crawl4AI Docker {version} version") # Health check with timeout and retry diff --git a/main.py b/main.py index 660c3366..92b1793b 100644 --- a/main.py +++ b/main.py @@ -10,6 +10,8 @@ from fastapi.exceptions import RequestValidationError from starlette.middleware.base import BaseHTTPMiddleware from starlette.responses import FileResponse from fastapi.responses import RedirectResponse +from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials +from fastapi import Depends, Security from pydantic import BaseModel, HttpUrl, Field from typing import Optional, List, Dict, Any, Union @@ -322,6 +324,21 @@ app.add_middleware( # Mount the pages directory as a static directory app.mount("/pages", StaticFiles(directory=__location__ + "/pages"), name="pages") +# API token security +security = HTTPBearer() +CRAWL4AI_API_TOKEN = os.getenv("CRAWL4AI_API_TOKEN") + +async def verify_token(credentials: HTTPAuthorizationCredentials = Security(security)): + if not CRAWL4AI_API_TOKEN: + return credentials # No token verification if CRAWL4AI_API_TOKEN is not set + if credentials.credentials != CRAWL4AI_API_TOKEN: + raise HTTPException(status_code=401, detail="Invalid token") + return credentials + +# Helper function to conditionally apply security +def secure_endpoint(): + return 
Depends(verify_token) if CRAWL4AI_API_TOKEN else None + # Check if site directory exists if os.path.exists(__location__ + "/site"): # Mount the site directory as a static directory @@ -348,12 +365,12 @@ def read_root(): return {"message": "Crawl4AI API service is running"} -@app.post("/crawl") +@app.post("/crawl", dependencies=[Depends(verify_token)]) async def crawl(request: CrawlRequest) -> Dict[str, str]: task_id = await crawler_service.submit_task(request) return {"task_id": task_id} -@app.get("/task/{task_id}") +@app.get("/task/{task_id}", dependencies=[Depends(verify_token)]) async def get_task_status(task_id: str): task_info = crawler_service.task_manager.get_task(task_id) if not task_info: @@ -375,7 +392,7 @@ async def get_task_status(task_id: str): return response -@app.post("/crawl_sync") +@app.post("/crawl_sync", dependencies=[Depends(verify_token)]) async def crawl_sync(request: CrawlRequest) -> Dict[str, Any]: task_id = await crawler_service.submit_task(request) From 9139ef3125b8a0bc96e2b26f3a06b09ecc60c020 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sat, 16 Nov 2024 18:19:44 +0800 Subject: [PATCH 036/115] feat(docker): update Dockerfile for improved installation process and enhance deployment documentation with Docker Compose setup and API token security --- Dockerfile | 9 +- docs/md_v2/basic/docker-deploymeny.md | 123 +++++++++++++++++--------- 2 files changed, 86 insertions(+), 46 deletions(-) diff --git a/Dockerfile b/Dockerfile index 76b4e1cf..aac2280a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -79,7 +79,6 @@ COPY . . RUN pip install --no-cache-dir -r requirements.txt # Install required library for FastAPI -RUN pip install . 
RUN pip install fastapi uvicorn psutil # Install ML dependencies first for better layer caching @@ -97,15 +96,15 @@ RUN if [ "$INSTALL_TYPE" = "all" ] ; then \ # Install the package RUN if [ "$INSTALL_TYPE" = "all" ] ; then \ - pip install -e ".[all]" && \ + pip install ".[all]" && \ python -m crawl4ai.model_loader ; \ elif [ "$INSTALL_TYPE" = "torch" ] ; then \ - pip install -e ".[torch]" ; \ + pip install ".[torch]" ; \ elif [ "$INSTALL_TYPE" = "transformer" ] ; then \ - pip install -e ".[transformer]" && \ + pip install ".[transformer]" && \ python -m crawl4ai.model_loader ; \ else \ - pip install -e "." ; \ + pip install "." ; \ fi # Install MkDocs and required plugins diff --git a/docs/md_v2/basic/docker-deploymeny.md b/docs/md_v2/basic/docker-deploymeny.md index cc11d0d9..a500ee21 100644 --- a/docs/md_v2/basic/docker-deploymeny.md +++ b/docs/md_v2/basic/docker-deploymeny.md @@ -1,71 +1,112 @@ -# Docker Deployment +# Docker Deployment 🐳 Crawl4AI provides official Docker images for easy deployment and scalability. This guide covers installation, configuration, and usage of Crawl4AI in Docker environments. 
-## Quick Start 🚀 +## Docker Compose Setup 🐳 -Pull and run the basic version: +### Basic Usage -```bash -docker pull unclecode/crawl4ai:basic -docker run -p 11235:11235 unclecode/crawl4ai:basic +Create a `docker-compose.yml`: +```yaml +version: '3.8' + +services: + crawl4ai: + image: unclecode/crawl4ai:all + ports: + - "11235:11235" + volumes: + - /dev/shm:/dev/shm + deploy: + resources: + limits: + memory: 4G + restart: unless-stopped ``` -Test the deployment: +Run with: +```bash +docker-compose up -d +``` + +### Secure Mode with API Token + +To enable API authentication, simply set the `CRAWL4AI_API_TOKEN`: +```bash +CRAWL4AI_API_TOKEN=your-secret-token docker-compose up -d +``` + +### Using Environment Variables + +Create a `.env` file for your API tokens: +```env +# Crawl4AI API Security (optional) +CRAWL4AI_API_TOKEN=your-secret-token + +# LLM Provider API Keys +OPENAI_API_KEY=sk-... +ANTHROPIC_API_KEY=sk-ant-... +GOOGLE_API_KEY=... +GEMINI_API_KEY=... +OLLAMA_API_KEY=... + +# Additional Configuration +MAX_CONCURRENT_TASKS=5 +``` + +Docker Compose will automatically load variables from the `.env` file. No additional configuration needed! 
+ +### Testing with API Token + ```python import requests -# Test health endpoint -health = requests.get("http://localhost:11235/health") -print("Health check:", health.json()) +# Initialize headers with token if using secure mode +headers = {} +if api_token := os.getenv('CRAWL4AI_API_TOKEN'): + headers['Authorization'] = f'Bearer {api_token}' -# Test basic crawl +# Test crawl with authentication response = requests.post( "http://localhost:11235/crawl", + headers=headers, json={ "urls": "https://www.nbcnews.com/business", "priority": 10 } ) task_id = response.json()["task_id"] -print("Task ID:", task_id) ``` -## Available Images 🏷️ +### Security Best Practices 🔒 -- `unclecode/crawl4ai:basic` - Basic web crawling capabilities -- `unclecode/crawl4ai:all` - Full installation with all features -- `unclecode/crawl4ai:gpu` - GPU-enabled version for ML features - -## Configuration Options 🔧 - -### Environment Variables - -```bash -docker run -p 11235:11235 \ - -e MAX_CONCURRENT_TASKS=5 \ - -e OPENAI_API_KEY=your_key \ - unclecode/crawl4ai:all +- Add `.env` to your `.gitignore` +- Use different API tokens for development and production +- Rotate API tokens periodically +- Use secure methods to pass tokens in production environments ``` -### Volume Mounting +This addition to your documentation: +1. Shows how to use Docker Compose +2. Explains both secure and non-secure modes +3. Demonstrates environment variable configuration +4. Provides example code for authenticated requests +5. 
Includes security best practices + + + + + + + + + + + + -Mount a directory for persistent data: -```bash -docker run -p 11235:11235 \ - -v $(pwd)/data:/app/data \ - unclecode/crawl4ai:all -``` -### Resource Limits -Control container resources: -```bash -docker run -p 11235:11235 \ - --memory=4g \ - --cpus=2 \ - unclecode/crawl4ai:all -``` ## Usage Examples 📝 From 4b45b28f256ad62272d5ea75ae898de7882618ba Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sat, 16 Nov 2024 18:44:47 +0800 Subject: [PATCH 037/115] feat(docs): enhance deployment documentation with one-click setup, API security details, and Docker Compose examples --- README.md | 15 ++ docs/examples/docker_example.py | 13 +- docs/md_v2/basic/docker-deploymeny.md | 230 ++++++++++++++++++++------ main.py | 3 + 4 files changed, 207 insertions(+), 54 deletions(-) diff --git a/README.md b/README.md index d250f936..a2806304 100644 --- a/README.md +++ b/README.md @@ -113,6 +113,21 @@ cd crawl4ai pip install -e . ``` +## One-Click Deployment 🚀 + +Deploy your own instance of Crawl4AI with one click: + +[![DigitalOcean Referral Badge](https://web-platforms.sfo2.cdn.digitaloceanspaces.com/WWW/Badge%203.svg)](https://www.digitalocean.com/?repo=https://github.com/unclecode/crawl4ai/tree/0.3.74&refcode=a0780f1bdb3d&utm_campaign=Referral_Invite&utm_medium=Referral_Program&utm_source=badge) + + +> 💡 **Recommended specs**: 4GB RAM minimum. Select "professional-xs" or higher when deploying for stable operation. + +The deploy will: +- Set up a Docker container with Crawl4AI +- Configure Playwright and all dependencies +- Start the FastAPI server on port 11235 +- Set up health checks and auto-deployment + ### Using Docker 🐳 Crawl4AI is available as Docker images for easy deployment. You can either pull directly from Docker Hub (recommended) or build from the repository. 
diff --git a/docs/examples/docker_example.py b/docs/examples/docker_example.py index 6701f6ac..b43e8ee6 100644 --- a/docs/examples/docker_example.py +++ b/docs/examples/docker_example.py @@ -15,6 +15,8 @@ class Crawl4AiTester: def submit_and_wait(self, request_data: Dict[str, Any], timeout: int = 300) -> Dict[str, Any]: # Submit crawl job response = requests.post(f"{self.base_url}/crawl", json=request_data, headers=self.headers) + if response.status_code == 403: + raise Exception("API token is invalid or missing") task_id = response.json()["task_id"] print(f"Task ID: {task_id}") @@ -45,8 +47,9 @@ class Crawl4AiTester: def test_docker_deployment(version="basic"): tester = Crawl4AiTester( - # base_url="http://localhost:11235" - base_url="https://crawl4ai-sby74.ondigitalocean.app" + base_url="http://localhost:11235" , + # base_url="https://crawl4ai-sby74.ondigitalocean.app", + api_token="test" ) print(f"Testing Crawl4AI Docker {version} version") @@ -83,7 +86,8 @@ def test_basic_crawl(tester: Crawl4AiTester): print("\n=== Testing Basic Crawl ===") request = { "urls": "https://www.nbcnews.com/business", - "priority": 10 + "priority": 10, + "session_id": "test" } result = tester.submit_and_wait(request) @@ -95,7 +99,8 @@ def test_basic_crawl_sync(tester: Crawl4AiTester): print("\n=== Testing Basic Crawl (Sync) ===") request = { "urls": "https://www.nbcnews.com/business", - "priority": 10 + "priority": 10, + "session_id": "test" } result = tester.submit_sync(request) diff --git a/docs/md_v2/basic/docker-deploymeny.md b/docs/md_v2/basic/docker-deploymeny.md index a500ee21..30555708 100644 --- a/docs/md_v2/basic/docker-deploymeny.md +++ b/docs/md_v2/basic/docker-deploymeny.md @@ -1,12 +1,115 @@ -# Docker Deployment 🐳 +# Docker Deployment Crawl4AI provides official Docker images for easy deployment and scalability. This guide covers installation, configuration, and usage of Crawl4AI in Docker environments. 
-## Docker Compose Setup 🐳 +## Quick Start 🚀 -### Basic Usage +Pull and run the basic version: + +```bash +# Basic run without security +docker pull unclecode/crawl4ai:basic +docker run -p 11235:11235 unclecode/crawl4ai:basic + +# Run with API security enabled +docker run -p 11235:11235 -e CRAWL4AI_API_TOKEN=your_secret_token unclecode/crawl4ai:basic +``` + +## API Security 🔒 + +### Understanding CRAWL4AI_API_TOKEN + +The `CRAWL4AI_API_TOKEN` provides optional security for your Crawl4AI instance: + +- If `CRAWL4AI_API_TOKEN` is set: All API endpoints (except `/health`) require authentication +- If `CRAWL4AI_API_TOKEN` is not set: The API is publicly accessible + +```bash +# Secured Instance +docker run -p 11235:11235 -e CRAWL4AI_API_TOKEN=your_secret_token unclecode/crawl4ai:all + +# Unsecured Instance +docker run -p 11235:11235 unclecode/crawl4ai:all +``` + +### Making API Calls + +For secured instances, include the token in all requests: + +```python +import requests + +# Setup headers if token is being used +api_token = "your_secret_token" # Same token set in CRAWL4AI_API_TOKEN +headers = {"Authorization": f"Bearer {api_token}"} if api_token else {} + +# Making authenticated requests +response = requests.post( + "http://localhost:11235/crawl", + headers=headers, + json={ + "urls": "https://example.com", + "priority": 10 + } +) + +# Checking task status +task_id = response.json()["task_id"] +status = requests.get( + f"http://localhost:11235/task/{task_id}", + headers=headers +) +``` + +### Using with Docker Compose + +In your `docker-compose.yml`: +```yaml +services: + crawl4ai: + image: unclecode/crawl4ai:all + environment: + - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} # Optional + # ... other configuration +``` + +Then either: +1. Set in `.env` file: +```env +CRAWL4AI_API_TOKEN=your_secret_token +``` + +2. 
Or set via command line: +```bash +CRAWL4AI_API_TOKEN=your_secret_token docker-compose up +``` + +> **Security Note**: If you enable the API token, make sure to keep it secure and never commit it to version control. The token will be required for all API endpoints except the health check endpoint (`/health`). + +## Configuration Options 🔧 + +### Environment Variables + +You can configure the service using environment variables: + +```bash +# Basic configuration +docker run -p 11235:11235 \ + -e MAX_CONCURRENT_TASKS=5 \ + unclecode/crawl4ai:all + +# With security and LLM support +docker run -p 11235:11235 \ + -e CRAWL4AI_API_TOKEN=your_secret_token \ + -e OPENAI_API_KEY=sk-... \ + -e ANTHROPIC_API_KEY=sk-ant-... \ + unclecode/crawl4ai:all +``` + +### Using Docker Compose (Recommended) 🐳 Create a `docker-compose.yml`: + ```yaml version: '3.8' @@ -15,83 +118,110 @@ services: image: unclecode/crawl4ai:all ports: - "11235:11235" + environment: + - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} # Optional API security + - MAX_CONCURRENT_TASKS=5 + # LLM Provider Keys + - OPENAI_API_KEY=${OPENAI_API_KEY:-} + - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-} volumes: - /dev/shm:/dev/shm deploy: resources: limits: memory: 4G - restart: unless-stopped + reservations: + memory: 1G ``` -Run with: +You can run it in two ways: + +1. Using environment variables directly: ```bash -docker-compose up -d +CRAWL4AI_API_TOKEN=secret123 OPENAI_API_KEY=sk-... docker-compose up ``` -### Secure Mode with API Token - -To enable API authentication, simply set the `CRAWL4AI_API_TOKEN`: -```bash -CRAWL4AI_API_TOKEN=your-secret-token docker-compose up -d -``` - -### Using Environment Variables - -Create a `.env` file for your API tokens: +2. 
Using a `.env` file (recommended): +Create a `.env` file in the same directory: ```env -# Crawl4AI API Security (optional) -CRAWL4AI_API_TOKEN=your-secret-token +# API Security (optional) +CRAWL4AI_API_TOKEN=your_secret_token -# LLM Provider API Keys +# LLM Provider Keys OPENAI_API_KEY=sk-... ANTHROPIC_API_KEY=sk-ant-... -GOOGLE_API_KEY=... -GEMINI_API_KEY=... -OLLAMA_API_KEY=... -# Additional Configuration +# Other Configuration MAX_CONCURRENT_TASKS=5 ``` -Docker Compose will automatically load variables from the `.env` file. No additional configuration needed! +Then simply run: +```bash +docker-compose up +``` -### Testing with API Token +### Testing the Deployment 🧪 ```python import requests -# Initialize headers with token if using secure mode -headers = {} -if api_token := os.getenv('CRAWL4AI_API_TOKEN'): - headers['Authorization'] = f'Bearer {api_token}' +# For unsecured instances +def test_unsecured(): + # Health check + health = requests.get("http://localhost:11235/health") + print("Health check:", health.json()) -# Test crawl with authentication -response = requests.post( - "http://localhost:11235/crawl", - headers=headers, - json={ - "urls": "https://www.nbcnews.com/business", - "priority": 10 + # Basic crawl + response = requests.post( + "http://localhost:11235/crawl", + json={ + "urls": "https://www.nbcnews.com/business", + "priority": 10 + } + ) + task_id = response.json()["task_id"] + print("Task ID:", task_id) + +# For secured instances +def test_secured(api_token): + headers = {"Authorization": f"Bearer {api_token}"} + + # Basic crawl with authentication + response = requests.post( + "http://localhost:11235/crawl", + headers=headers, + json={ + "urls": "https://www.nbcnews.com/business", + "priority": 10 + } + ) + task_id = response.json()["task_id"] + print("Task ID:", task_id) +``` + +### LLM Extraction Example 🤖 + +When you've configured your LLM provider keys (via environment variables or `.env`), you can use LLM extraction: + +```python 
+request = { + "urls": "https://example.com", + "extraction_config": { + "type": "llm", + "params": { + "provider": "openai/gpt-4", + "instruction": "Extract main topics from the page" + } } -) -task_id = response.json()["task_id"] +} + +# Make the request (add headers if using API security) +response = requests.post("http://localhost:11235/crawl", json=request) ``` -### Security Best Practices 🔒 +> **Note**: Remember to add `.env` to your `.gitignore` to keep your API keys secure! -- Add `.env` to your `.gitignore` -- Use different API tokens for development and production -- Rotate API tokens periodically -- Use secure methods to pass tokens in production environments -``` -This addition to your documentation: -1. Shows how to use Docker Compose -2. Explains both secure and non-secure modes -3. Demonstrates environment variable configuration -4. Provides example code for authenticated requests -5. Includes security best practices diff --git a/main.py b/main.py index 92b1793b..41788d61 100644 --- a/main.py +++ b/main.py @@ -65,6 +65,7 @@ class CrawlRequest(BaseModel): screenshot: bool = False magic: bool = False extra: Optional[Dict[str, Any]] = {} + session_id: Optional[str] = None @dataclass class TaskInfo: @@ -284,6 +285,7 @@ class CrawlerService: css_selector=request.css_selector, screenshot=request.screenshot, magic=request.magic, + session_id=request.session_id, **request.extra, ) else: @@ -295,6 +297,7 @@ class CrawlerService: css_selector=request.css_selector, screenshot=request.screenshot, magic=request.magic, + session_id=request.session_id, **request.extra, ) From 3a66aa8a60ae7213bb8437003b58a631df208ffb Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sun, 17 Nov 2024 15:30:56 +0800 Subject: [PATCH 038/115] feat(cache): introduce CacheMode and CacheContext for enhanced caching behavior chore(requirements): add colorama dependency refactor(config): add SHOW_DEPRECATION_WARNINGS flag and clean up code fix(docs): update example scripts for clarity and 
consistency --- crawl4ai/__init__.py | 3 +- crawl4ai/async_crawler_strategy.py | 8 +- crawl4ai/async_webcrawler.3.73.py | 344 +++++++++++++++++++++++++++++ crawl4ai/async_webcrawler.py | 285 +++++++++++++++++------- crawl4ai/cache_context.py | 79 +++++++ crawl4ai/config.py | 3 +- docs/examples/docker_example.py | 5 +- docs/examples/quickstart_async.py | 12 +- requirements.txt | 3 +- tests/docker_example.py | 332 ++++++++++++++++++++++++++++ 10 files changed, 979 insertions(+), 95 deletions(-) create mode 100644 crawl4ai/async_webcrawler.3.73.py create mode 100644 crawl4ai/cache_context.py create mode 100644 tests/docker_example.py diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py index e55aaf73..ad9475b4 100644 --- a/crawl4ai/__init__.py +++ b/crawl4ai/__init__.py @@ -1,6 +1,6 @@ # __init__.py -from .async_webcrawler import AsyncWebCrawler +from .async_webcrawler import AsyncWebCrawler, CacheMode from .models import CrawlResult from .__version__ import __version__ # __version__ = "0.3.73" @@ -8,6 +8,7 @@ from .__version__ import __version__ __all__ = [ "AsyncWebCrawler", "CrawlResult", + "CacheMode", ] def is_sync_version_installed(): diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 83933a35..a67591af 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -669,8 +669,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if self.accept_downloads: page.on("download", lambda download: asyncio.create_task(self._handle_download(download))) - if self.verbose: - print(f"[LOG] 🕸️ Crawling {url} using AsyncPlaywrightCrawlerStrategy...") + # if self.verbose: + # print(f"[LOG] 🕸️ Crawling {url} using AsyncPlaywrightCrawlerStrategy...") if self.use_cached_html: cache_file_path = os.path.join( @@ -873,8 +873,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): await asyncio.sleep(screenshot_wait_for) screenshot_data = await self.take_screenshot(page) - if self.verbose: - 
print(f"[LOG] ✅ Crawled {url} successfully!") + # if self.verbose: + # print(f"[LOG] ✅ Crawled {url} successfully!") if self.use_cached_html: cache_file_path = os.path.join( diff --git a/crawl4ai/async_webcrawler.3.73.py b/crawl4ai/async_webcrawler.3.73.py new file mode 100644 index 00000000..03e7a393 --- /dev/null +++ b/crawl4ai/async_webcrawler.3.73.py @@ -0,0 +1,344 @@ +import os +import time +from pathlib import Path +from typing import Optional +import json +import asyncio +from .models import CrawlResult +from .async_database import async_db_manager +from .chunking_strategy import * +from .extraction_strategy import * +from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse +from .content_scrapping_strategy import WebScrapingStrategy +from .config import MIN_WORD_THRESHOLD, IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD +from .utils import ( + sanitize_input_encode, + InvalidCSSSelectorError, + format_html +) +from .__version__ import __version__ as crawl4ai_version + +class AsyncWebCrawler: + def __init__( + self, + crawler_strategy: Optional[AsyncCrawlerStrategy] = None, + always_by_pass_cache: bool = False, + base_directory: str = str(Path.home()), + **kwargs, + ): + self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy( + **kwargs + ) + self.always_by_pass_cache = always_by_pass_cache + # self.crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai") + self.crawl4ai_folder = os.path.join(base_directory, ".crawl4ai") + os.makedirs(self.crawl4ai_folder, exist_ok=True) + os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True) + self.ready = False + self.verbose = kwargs.get("verbose", False) + + async def __aenter__(self): + await self.crawler_strategy.__aenter__() + await self.awarmup() + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb): + await self.crawler_strategy.__aexit__(exc_type, exc_val, exc_tb) + + async def awarmup(self): + # Print a message for crawl4ai and its 
version + if self.verbose: + print(f"[LOG] 🚀 Crawl4AI {crawl4ai_version}") + print("[LOG] 🌤️ Warming up the AsyncWebCrawler") + # await async_db_manager.ainit_db() + # # await async_db_manager.initialize() + # await self.arun( + # url="https://google.com/", + # word_count_threshold=5, + # bypass_cache=False, + # verbose=False, + # ) + self.ready = True + if self.verbose: + print("[LOG] 🌞 AsyncWebCrawler is ready to crawl") + + async def arun( + self, + url: str, + word_count_threshold=MIN_WORD_THRESHOLD, + extraction_strategy: ExtractionStrategy = None, + chunking_strategy: ChunkingStrategy = RegexChunking(), + bypass_cache: bool = False, + css_selector: str = None, + screenshot: bool = False, + user_agent: str = None, + verbose=True, + disable_cache: bool = False, + no_cache_read: bool = False, + no_cache_write: bool = False, + **kwargs, + ) -> CrawlResult: + """ + Runs the crawler for a single source: URL (web, local file, or raw HTML). + + Args: + url (str): The URL to crawl. Supported prefixes: + - 'http://' or 'https://': Web URL to crawl. + - 'file://': Local file path to process. + - 'raw:': Raw HTML content to process. + ... [other existing parameters] + + Returns: + CrawlResult: The result of the crawling and processing. 
+ """ + try: + if disable_cache: + bypass_cache = True + no_cache_read = True + no_cache_write = True + + extraction_strategy = extraction_strategy or NoExtractionStrategy() + extraction_strategy.verbose = verbose + if not isinstance(extraction_strategy, ExtractionStrategy): + raise ValueError("Unsupported extraction strategy") + if not isinstance(chunking_strategy, ChunkingStrategy): + raise ValueError("Unsupported chunking strategy") + + word_count_threshold = max(word_count_threshold, MIN_WORD_THRESHOLD) + + async_response: AsyncCrawlResponse = None + cached = None + screenshot_data = None + extracted_content = None + + is_web_url = url.startswith(('http://', 'https://')) + is_local_file = url.startswith("file://") + is_raw_html = url.startswith("raw:") + _url = url if not is_raw_html else "Raw HTML" + + start_time = time.perf_counter() + cached_result = None + if is_web_url and (not bypass_cache or not no_cache_read) and not self.always_by_pass_cache: + cached_result = await async_db_manager.aget_cached_url(url) + + if cached_result: + html = sanitize_input_encode(cached_result.html) + extracted_content = sanitize_input_encode(cached_result.extracted_content or "") + if screenshot: + screenshot_data = cached_result.screenshot + if not screenshot_data: + cached_result = None + if verbose: + print( + f"[LOG] 1️⃣ ✅ Page fetched (cache) for {_url}, success: {bool(html)}, time taken: {time.perf_counter() - start_time:.2f} seconds" + ) + + + if not cached or not html: + t1 = time.perf_counter() + + if user_agent: + self.crawler_strategy.update_user_agent(user_agent) + async_response: AsyncCrawlResponse = await self.crawler_strategy.crawl(url, screenshot=screenshot, **kwargs) + html = sanitize_input_encode(async_response.html) + screenshot_data = async_response.screenshot + t2 = time.perf_counter() + if verbose: + print( + f"[LOG] 1️⃣ ✅ Page fetched (no-cache) for {_url}, success: {bool(html)}, time taken: {t2 - t1:.2f} seconds" + ) + + t1 = time.perf_counter() + 
crawl_result = await self.aprocess_html( + url=url, + html=html, + extracted_content=extracted_content, + word_count_threshold=word_count_threshold, + extraction_strategy=extraction_strategy, + chunking_strategy=chunking_strategy, + css_selector=css_selector, + screenshot=screenshot_data, + verbose=verbose, + is_cached=bool(cached), + async_response=async_response, + bypass_cache=bypass_cache, + is_web_url = is_web_url, + is_local_file = is_local_file, + is_raw_html = is_raw_html, + **kwargs, + ) + + if async_response: + crawl_result.status_code = async_response.status_code + crawl_result.response_headers = async_response.response_headers + crawl_result.downloaded_files = async_response.downloaded_files + else: + crawl_result.status_code = 200 + crawl_result.response_headers = cached_result.response_headers if cached_result else {} + + crawl_result.success = bool(html) + crawl_result.session_id = kwargs.get("session_id", None) + + if verbose: + print( + f"[LOG] 🔥 🚀 Crawling done for {_url}, success: {crawl_result.success}, time taken: {time.perf_counter() - start_time:.2f} seconds" + ) + + if not is_raw_html and not no_cache_write: + if not bool(cached_result) or kwargs.get("bypass_cache", False) or self.always_by_pass_cache: + await async_db_manager.acache_url(crawl_result) + + + return crawl_result + + except Exception as e: + if not hasattr(e, "msg"): + e.msg = str(e) + print(f"[ERROR] 🚫 arun(): Failed to crawl {_url}, error: {e.msg}") + return CrawlResult(url=url, html="", markdown = f"[ERROR] 🚫 arun(): Failed to crawl {_url}, error: {e.msg}", success=False, error_message=e.msg) + + async def arun_many( + self, + urls: List[str], + word_count_threshold=MIN_WORD_THRESHOLD, + extraction_strategy: ExtractionStrategy = None, + chunking_strategy: ChunkingStrategy = RegexChunking(), + bypass_cache: bool = False, + css_selector: str = None, + screenshot: bool = False, + user_agent: str = None, + verbose=True, + **kwargs, + ) -> List[CrawlResult]: + """ + Runs the 
crawler for multiple sources: URLs (web, local files, or raw HTML). + + Args: + urls (List[str]): A list of URLs with supported prefixes: + - 'http://' or 'https://': Web URL to crawl. + - 'file://': Local file path to process. + - 'raw:': Raw HTML content to process. + ... [other existing parameters] + + Returns: + List[CrawlResult]: The results of the crawling and processing. + """ + semaphore_count = kwargs.get('semaphore_count', 5) # Adjust as needed + semaphore = asyncio.Semaphore(semaphore_count) + + async def crawl_with_semaphore(url): + async with semaphore: + return await self.arun( + url, + word_count_threshold=word_count_threshold, + extraction_strategy=extraction_strategy, + chunking_strategy=chunking_strategy, + bypass_cache=bypass_cache, + css_selector=css_selector, + screenshot=screenshot, + user_agent=user_agent, + verbose=verbose, + **kwargs, + ) + + tasks = [crawl_with_semaphore(url) for url in urls] + results = await asyncio.gather(*tasks, return_exceptions=True) + return [result if not isinstance(result, Exception) else str(result) for result in results] + + async def aprocess_html( + self, + url: str, + html: str, + extracted_content: str, + word_count_threshold: int, + extraction_strategy: ExtractionStrategy, + chunking_strategy: ChunkingStrategy, + css_selector: str, + screenshot: str, + verbose: bool, + **kwargs, + ) -> CrawlResult: + t = time.perf_counter() + # Extract content from HTML + try: + _url = url if not kwargs.get("is_raw_html", False) else "Raw HTML" + t1 = time.perf_counter() + scrapping_strategy = WebScrapingStrategy() + # result = await scrapping_strategy.ascrap( + result = scrapping_strategy.scrap( + url, + html, + word_count_threshold=word_count_threshold, + css_selector=css_selector, + only_text=kwargs.get("only_text", False), + image_description_min_word_threshold=kwargs.get( + "image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD + ), + **kwargs, + ) + + if result is None: + raise 
ValueError(f"Process HTML, Failed to extract content from the website: {url}") + except InvalidCSSSelectorError as e: + raise ValueError(str(e)) + except Exception as e: + raise ValueError(f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}") + + cleaned_html = sanitize_input_encode(result.get("cleaned_html", "")) + markdown = sanitize_input_encode(result.get("markdown", "")) + fit_markdown = sanitize_input_encode(result.get("fit_markdown", "")) + fit_html = sanitize_input_encode(result.get("fit_html", "")) + media = result.get("media", []) + links = result.get("links", []) + metadata = result.get("metadata", {}) + + if verbose: + print( + f"[LOG] 2️⃣ ✅ Scraping done for {_url}, success: True, time taken: {time.perf_counter() - t1:.2f} seconds" + ) + + if extracted_content is None and extraction_strategy and chunking_strategy and not isinstance(extraction_strategy, NoExtractionStrategy): + t1 = time.perf_counter() + # Check if extraction strategy is type of JsonCssExtractionStrategy + if isinstance(extraction_strategy, JsonCssExtractionStrategy) or isinstance(extraction_strategy, JsonCssExtractionStrategy): + extraction_strategy.verbose = verbose + extracted_content = extraction_strategy.run(url, [html]) + extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False) + else: + sections = chunking_strategy.chunk(markdown) + extracted_content = extraction_strategy.run(url, sections) + extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False) + if verbose: + print( + f"[LOG] 3️⃣ ✅ Extraction done for {_url}, time taken: {time.perf_counter() - t1:.2f} seconds" + ) + + screenshot = None if not screenshot else screenshot + + return CrawlResult( + url=url, + html=html, + cleaned_html=format_html(cleaned_html), + markdown=markdown, + fit_markdown=fit_markdown, + fit_html= fit_html, + media=media, + links=links, + metadata=metadata, + screenshot=screenshot, + 
extracted_content=extracted_content, + success=True, + error_message="", + ) + + async def aclear_cache(self): + # await async_db_manager.aclear_db() + await async_db_manager.cleanup() + + async def aflush_cache(self): + await async_db_manager.aflush_db() + + async def aget_cache_size(self): + return await async_db_manager.aget_total_count() + + diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 03e7a393..d554576d 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -1,7 +1,10 @@ import os import time +import warnings +from enum import Enum +from colorama import init, Fore, Back, Style from pathlib import Path -from typing import Optional +from typing import Optional, List, Union import json import asyncio from .models import CrawlResult @@ -9,8 +12,13 @@ from .async_database import async_db_manager from .chunking_strategy import * from .extraction_strategy import * from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse +from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode from .content_scrapping_strategy import WebScrapingStrategy -from .config import MIN_WORD_THRESHOLD, IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD +from .config import ( + MIN_WORD_THRESHOLD, + IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD, + SHOW_DEPRECATION_WARNINGS # New import +) from .utils import ( sanitize_input_encode, InvalidCSSSelectorError, @@ -18,19 +26,77 @@ from .utils import ( ) from .__version__ import __version__ as crawl4ai_version + class AsyncWebCrawler: + """ + Asynchronous web crawler with flexible caching capabilities. 
+ + Migration Guide (from version X.X.X): + Old way (deprecated): + crawler = AsyncWebCrawler(always_by_pass_cache=True) + result = await crawler.arun( + url="https://example.com", + bypass_cache=True, + no_cache_read=True, + no_cache_write=False + ) + + New way (recommended): + crawler = AsyncWebCrawler(always_bypass_cache=True) + result = await crawler.arun( + url="https://example.com", + cache_mode=CacheMode.WRITE_ONLY + ) + + To disable deprecation warnings: + Set SHOW_DEPRECATION_WARNINGS = False in config.py + """ + def __init__( self, crawler_strategy: Optional[AsyncCrawlerStrategy] = None, - always_by_pass_cache: bool = False, + always_bypass_cache: bool = False, + always_by_pass_cache: Optional[bool] = None, # Deprecated parameter base_directory: str = str(Path.home()), **kwargs, ): - self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy( - **kwargs - ) - self.always_by_pass_cache = always_by_pass_cache - # self.crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai") + """ + Initialize the AsyncWebCrawler. + + Args: + crawler_strategy: Strategy for crawling web pages + always_bypass_cache: Whether to always bypass cache (new parameter) + always_by_pass_cache: Deprecated, use always_bypass_cache instead + base_directory: Base directory for storing cache + """ + init() + self.log_width = 10 # Width of "[COMPLETE]" + self.tag_format = lambda tag: f"[{tag}]".ljust(self.log_width, ".") + self.log_icons = { + 'INIT': '→', # Alternative: '▶' or '►' + 'READY': '✓', # Alternative: '√' + 'FETCH': '↓', # Alternative: '▼' + 'SCRAPE': '◆', # Alternative: '♦' + 'EXTRACT': '■', # Alternative: '□' + 'COMPLETE': '●', # Alternative: '○' + 'ERROR': '×' + } + self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy(**kwargs) + + # Handle deprecated parameter + if always_by_pass_cache is not None: + if SHOW_DEPRECATION_WARNINGS: + warnings.warn( + "'always_by_pass_cache' is deprecated and will be removed in version X.X.X. 
" + "Use 'always_bypass_cache' instead. " + "Set SHOW_DEPRECATION_WARNINGS=False in config.py to suppress this warning.", + DeprecationWarning, + stacklevel=2 + ) + self.always_bypass_cache = always_by_pass_cache + else: + self.always_bypass_cache = always_bypass_cache + self.crawl4ai_folder = os.path.join(base_directory, ".crawl4ai") os.makedirs(self.crawl4ai_folder, exist_ok=True) os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True) @@ -46,21 +112,13 @@ class AsyncWebCrawler: await self.crawler_strategy.__aexit__(exc_type, exc_val, exc_tb) async def awarmup(self): - # Print a message for crawl4ai and its version + """Initialize the crawler with warm-up sequence.""" if self.verbose: - print(f"[LOG] 🚀 Crawl4AI {crawl4ai_version}") - print("[LOG] 🌤️ Warming up the AsyncWebCrawler") - # await async_db_manager.ainit_db() - # # await async_db_manager.initialize() - # await self.arun( - # url="https://google.com/", - # word_count_threshold=5, - # bypass_cache=False, - # verbose=False, - # ) + print(f"{Fore.CYAN}{self.tag_format('INIT')} {self.log_icons['INIT']} Crawl4AI {crawl4ai_version}{Style.RESET_ALL}") + print(f"{Fore.CYAN}{self.tag_format('INIT')} {self.log_icons['INIT']} Warming up AsyncWebCrawler{Style.RESET_ALL}") self.ready = True if self.verbose: - print("[LOG] 🌞 AsyncWebCrawler is ready to crawl") + print(f"{Fore.GREEN}{self.tag_format('READY')} {self.log_icons['READY']} AsyncWebCrawler initialized{Style.RESET_ALL}") async def arun( self, @@ -68,35 +126,81 @@ class AsyncWebCrawler: word_count_threshold=MIN_WORD_THRESHOLD, extraction_strategy: ExtractionStrategy = None, chunking_strategy: ChunkingStrategy = RegexChunking(), + cache_mode: Optional[CacheMode] = None, + # Deprecated parameters bypass_cache: bool = False, + disable_cache: bool = False, + no_cache_read: bool = False, + no_cache_write: bool = False, + # Other parameters css_selector: str = None, screenshot: bool = False, user_agent: str = None, verbose=True, - disable_cache: bool = False, - 
no_cache_read: bool = False, - no_cache_write: bool = False, **kwargs, ) -> CrawlResult: """ Runs the crawler for a single source: URL (web, local file, or raw HTML). + Migration from legacy cache parameters: + Old way (deprecated): + await crawler.arun(url, bypass_cache=True, no_cache_read=True) + + New way: + await crawler.arun(url, cache_mode=CacheMode.BYPASS) + Args: - url (str): The URL to crawl. Supported prefixes: - - 'http://' or 'https://': Web URL to crawl. - - 'file://': Local file path to process. - - 'raw:': Raw HTML content to process. - ... [other existing parameters] + url: The URL to crawl (http://, https://, file://, or raw:) + cache_mode: Cache behavior control (recommended) + word_count_threshold: Minimum word count threshold + extraction_strategy: Strategy for content extraction + chunking_strategy: Strategy for content chunking + css_selector: CSS selector for content extraction + screenshot: Whether to capture screenshot + user_agent: Custom user agent + verbose: Enable verbose logging + + Deprecated Args: + bypass_cache: Use cache_mode=CacheMode.BYPASS instead + disable_cache: Use cache_mode=CacheMode.DISABLED instead + no_cache_read: Use cache_mode=CacheMode.WRITE_ONLY instead + no_cache_write: Use cache_mode=CacheMode.READ_ONLY instead Returns: - CrawlResult: The result of the crawling and processing. + CrawlResult: The result of crawling and processing """ try: - if disable_cache: - bypass_cache = True - no_cache_read = True - no_cache_write = True + # Handle deprecated parameters + if any([bypass_cache, disable_cache, no_cache_read, no_cache_write]): + if SHOW_DEPRECATION_WARNINGS: + warnings.warn( + "Cache control boolean flags are deprecated and will be removed in version X.X.X. " + "Use 'cache_mode' parameter instead. 
Examples:\n" + "- For bypass_cache=True, use cache_mode=CacheMode.BYPASS\n" + "- For disable_cache=True, use cache_mode=CacheMode.DISABLED\n" + "- For no_cache_read=True, use cache_mode=CacheMode.WRITE_ONLY\n" + "- For no_cache_write=True, use cache_mode=CacheMode.READ_ONLY\n" + "Set SHOW_DEPRECATION_WARNINGS=False in config.py to suppress this warning.", + DeprecationWarning, + stacklevel=2 + ) + + # Convert legacy parameters if cache_mode not provided + if cache_mode is None: + cache_mode = _legacy_to_cache_mode( + disable_cache=disable_cache, + bypass_cache=bypass_cache, + no_cache_read=no_cache_read, + no_cache_write=no_cache_write + ) + # Default to ENABLED if no cache mode specified + if cache_mode is None: + cache_mode = CacheMode.ENABLED + + # Create cache context + cache_context = CacheContext(url, cache_mode, self.always_bypass_cache) + extraction_strategy = extraction_strategy or NoExtractionStrategy() extraction_strategy.verbose = verbose if not isinstance(extraction_strategy, ExtractionStrategy): @@ -107,18 +211,14 @@ class AsyncWebCrawler: word_count_threshold = max(word_count_threshold, MIN_WORD_THRESHOLD) async_response: AsyncCrawlResponse = None - cached = None + cached_result = None screenshot_data = None extracted_content = None - is_web_url = url.startswith(('http://', 'https://')) - is_local_file = url.startswith("file://") - is_raw_html = url.startswith("raw:") - _url = url if not is_raw_html else "Raw HTML" - start_time = time.perf_counter() - cached_result = None - if is_web_url and (not bypass_cache or not no_cache_read) and not self.always_by_pass_cache: + + # Try to get cached result if appropriate + if cache_context.should_read(): cached_result = await async_db_manager.aget_cached_url(url) if cached_result: @@ -129,26 +229,27 @@ class AsyncWebCrawler: if not screenshot_data: cached_result = None if verbose: - print( - f"[LOG] 1️⃣ ✅ Page fetched (cache) for {_url}, success: {bool(html)}, time taken: {time.perf_counter() - start_time:.2f} 
seconds" - ) + print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Cache hit for {cache_context.display_url} | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {time.perf_counter() - start_time:.2f}s") - if not cached or not html: + # Fetch fresh content if needed + if not cached_result or not html: t1 = time.perf_counter() if user_agent: self.crawler_strategy.update_user_agent(user_agent) - async_response: AsyncCrawlResponse = await self.crawler_strategy.crawl(url, screenshot=screenshot, **kwargs) + async_response: AsyncCrawlResponse = await self.crawler_strategy.crawl( + url, + screenshot=screenshot, + **kwargs + ) html = sanitize_input_encode(async_response.html) screenshot_data = async_response.screenshot t2 = time.perf_counter() if verbose: - print( - f"[LOG] 1️⃣ ✅ Page fetched (no-cache) for {_url}, success: {bool(html)}, time taken: {t2 - t1:.2f} seconds" - ) + print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Live fetch for {cache_context.display_url} | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {t2 - t1:.2f}s") - t1 = time.perf_counter() + # Process the HTML content crawl_result = await self.aprocess_html( url=url, html=html, @@ -159,15 +260,15 @@ class AsyncWebCrawler: css_selector=css_selector, screenshot=screenshot_data, verbose=verbose, - is_cached=bool(cached), + is_cached=bool(cached_result), async_response=async_response, - bypass_cache=bypass_cache, - is_web_url = is_web_url, - is_local_file = is_local_file, - is_raw_html = is_raw_html, + is_web_url=cache_context.is_web_url, + is_local_file=cache_context.is_local_file, + is_raw_html=cache_context.is_raw_html, **kwargs, ) + # Set response data if async_response: crawl_result.status_code = async_response.status_code crawl_result.response_headers = async_response.response_headers @@ -180,22 +281,26 @@ class AsyncWebCrawler: crawl_result.session_id = kwargs.get("session_id", None) 
if verbose: - print( - f"[LOG] 🔥 🚀 Crawling done for {_url}, success: {crawl_result.success}, time taken: {time.perf_counter() - start_time:.2f} seconds" - ) + print(f"{Fore.GREEN}{self.tag_format('COMPLETE')} {self.log_icons['COMPLETE']} {cache_context.display_url} | Status: {Fore.GREEN if crawl_result.success else Fore.RED}{crawl_result.success} | {Fore.YELLOW}Total: {time.perf_counter() - start_time:.2f}s{Style.RESET_ALL}") - if not is_raw_html and not no_cache_write: - if not bool(cached_result) or kwargs.get("bypass_cache", False) or self.always_by_pass_cache: - await async_db_manager.acache_url(crawl_result) + # Update cache if appropriate + if cache_context.should_write() and not bool(cached_result): + await async_db_manager.acache_url(crawl_result) return crawl_result except Exception as e: if not hasattr(e, "msg"): e.msg = str(e) - print(f"[ERROR] 🚫 arun(): Failed to crawl {_url}, error: {e.msg}") - return CrawlResult(url=url, html="", markdown = f"[ERROR] 🚫 arun(): Failed to crawl {_url}, error: {e.msg}", success=False, error_message=e.msg) + print(f"{Fore.RED}{self.tag_format('ERROR')} {self.log_icons['ERROR']} Failed to crawl {cache_context.display_url} | {e.msg}{Style.RESET_ALL}") + return CrawlResult( + url=url, + html="", + markdown=f"[ERROR] 🚫 arun(): Failed to crawl {cache_context.display_url}, error: {e.msg}", + success=False, + error_message=e.msg + ) async def arun_many( self, @@ -203,6 +308,8 @@ class AsyncWebCrawler: word_count_threshold=MIN_WORD_THRESHOLD, extraction_strategy: ExtractionStrategy = None, chunking_strategy: ChunkingStrategy = RegexChunking(), + cache_mode: Optional[CacheMode] = None, + # Deprecated parameters bypass_cache: bool = False, css_selector: str = None, screenshot: bool = False, @@ -211,19 +318,35 @@ class AsyncWebCrawler: **kwargs, ) -> List[CrawlResult]: """ - Runs the crawler for multiple sources: URLs (web, local files, or raw HTML). + Runs the crawler for multiple URLs concurrently. 
+ + Migration from legacy parameters: + Old way (deprecated): + results = await crawler.arun_many(urls, bypass_cache=True) + + New way: + results = await crawler.arun_many(urls, cache_mode=CacheMode.BYPASS) Args: - urls (List[str]): A list of URLs with supported prefixes: - - 'http://' or 'https://': Web URL to crawl. - - 'file://': Local file path to process. - - 'raw:': Raw HTML content to process. - ... [other existing parameters] + urls: List of URLs to crawl + cache_mode: Cache behavior control (recommended) + [other parameters same as arun()] Returns: - List[CrawlResult]: The results of the crawling and processing. + List[CrawlResult]: Results for each URL """ - semaphore_count = kwargs.get('semaphore_count', 5) # Adjust as needed + if bypass_cache and SHOW_DEPRECATION_WARNINGS: + warnings.warn( + "'bypass_cache' is deprecated and will be removed in version X.X.X. " + "Use 'cache_mode=CacheMode.BYPASS' instead. " + "Set SHOW_DEPRECATION_WARNINGS=False in config.py to suppress this warning.", + DeprecationWarning, + stacklevel=2 + ) + if cache_mode is None: + cache_mode = CacheMode.BYPASS + + semaphore_count = kwargs.get('semaphore_count', 5) semaphore = asyncio.Semaphore(semaphore_count) async def crawl_with_semaphore(url): @@ -233,7 +356,7 @@ class AsyncWebCrawler: word_count_threshold=word_count_threshold, extraction_strategy=extraction_strategy, chunking_strategy=chunking_strategy, - bypass_cache=bypass_cache, + cache_mode=cache_mode, css_selector=css_selector, screenshot=screenshot, user_agent=user_agent, @@ -245,6 +368,7 @@ class AsyncWebCrawler: results = await asyncio.gather(*tasks, return_exceptions=True) return [result if not isinstance(result, Exception) else str(result) for result in results] + async def aprocess_html( self, url: str, @@ -258,7 +382,6 @@ class AsyncWebCrawler: verbose: bool, **kwargs, ) -> CrawlResult: - t = time.perf_counter() # Extract content from HTML try: _url = url if not kwargs.get("is_raw_html", False) else "Raw HTML" @@ 
-293,9 +416,9 @@ class AsyncWebCrawler: metadata = result.get("metadata", {}) if verbose: - print( - f"[LOG] 2️⃣ ✅ Scraping done for {_url}, success: True, time taken: {time.perf_counter() - t1:.2f} seconds" - ) + print(f"{Fore.MAGENTA}{self.tag_format('SCRAPE')} {self.log_icons['SCRAPE']} Processed {_url}{Style.RESET_ALL} | Time: {int((time.perf_counter() - t1) * 1000)}ms") + + if extracted_content is None and extraction_strategy and chunking_strategy and not isinstance(extraction_strategy, NoExtractionStrategy): t1 = time.perf_counter() @@ -309,9 +432,9 @@ class AsyncWebCrawler: extracted_content = extraction_strategy.run(url, sections) extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False) if verbose: - print( - f"[LOG] 3️⃣ ✅ Extraction done for {_url}, time taken: {time.perf_counter() - t1:.2f} seconds" - ) + print(f"{Fore.YELLOW}{self.tag_format('EXTRACT')} {self.log_icons['EXTRACT']} Completed for {_url}{Style.RESET_ALL} | Time: {time.perf_counter() - t1:.2f}s{Style.RESET_ALL}") + + screenshot = None if not screenshot else screenshot @@ -332,13 +455,15 @@ class AsyncWebCrawler: ) async def aclear_cache(self): - # await async_db_manager.aclear_db() + """Clear the cache database.""" await async_db_manager.cleanup() async def aflush_cache(self): + """Flush the cache database.""" await async_db_manager.aflush_db() async def aget_cache_size(self): + """Get the total number of cached items.""" return await async_db_manager.aget_total_count() diff --git a/crawl4ai/cache_context.py b/crawl4ai/cache_context.py new file mode 100644 index 00000000..429eacc1 --- /dev/null +++ b/crawl4ai/cache_context.py @@ -0,0 +1,79 @@ +from enum import Enum + + +class CacheMode(Enum): + """ + Defines the caching behavior for web crawling operations. 
+ + Modes: + - ENABLED: Normal caching behavior (read and write) + - DISABLED: No caching at all + - READ_ONLY: Only read from cache, don't write + - WRITE_ONLY: Only write to cache, don't read + - BYPASS: Bypass cache for this operation + """ + ENABLED = "enabled" + DISABLED = "disabled" + READ_ONLY = "read_only" + WRITE_ONLY = "write_only" + BYPASS = "bypass" + + +class CacheContext: + """ + Encapsulates cache-related decisions and URL handling. + + This class centralizes all cache-related logic and URL type checking, + making the caching behavior more predictable and maintainable. + """ + def __init__(self, url: str, cache_mode: CacheMode, always_bypass: bool = False): + self.url = url + self.cache_mode = cache_mode + self.always_bypass = always_bypass + self.is_cacheable = url.startswith(('http://', 'https://', 'file://')) + self.is_web_url = url.startswith(('http://', 'https://')) + self.is_local_file = url.startswith("file://") + self.is_raw_html = url.startswith("raw:") + self._url_display = url if not self.is_raw_html else "Raw HTML" + + def should_read(self) -> bool: + """Determines if cache should be read based on context.""" + if self.always_bypass or not self.is_cacheable: + return False + return self.cache_mode in [CacheMode.ENABLED, CacheMode.READ_ONLY] + + def should_write(self) -> bool: + """Determines if cache should be written based on context.""" + if self.always_bypass or not self.is_cacheable: + return False + return self.cache_mode in [CacheMode.ENABLED, CacheMode.WRITE_ONLY] + + @property + def display_url(self) -> str: + """Returns the URL in display format.""" + return self._url_display + + +def _legacy_to_cache_mode( + disable_cache: bool = False, + bypass_cache: bool = False, + no_cache_read: bool = False, + no_cache_write: bool = False +) -> CacheMode: + """ + Converts legacy cache parameters to the new CacheMode enum. + + This is an internal function to help transition from the old boolean flags + to the new CacheMode system. 
+ """ + if disable_cache: + return CacheMode.DISABLED + if bypass_cache: + return CacheMode.BYPASS + if no_cache_read and no_cache_write: + return CacheMode.DISABLED + if no_cache_read: + return CacheMode.WRITE_ONLY + if no_cache_write: + return CacheMode.READ_ONLY + return CacheMode.ENABLED diff --git a/crawl4ai/config.py b/crawl4ai/config.py index 5bc284bf..6b1324dd 100644 --- a/crawl4ai/config.py +++ b/crawl4ai/config.py @@ -54,4 +54,5 @@ IMAGE_SCORE_THRESHOLD = 2 MAX_METRICS_HISTORY = 1000 -NEED_MIGRATION = True \ No newline at end of file +NEED_MIGRATION = True +SHOW_DEPRECATION_WARNINGS = True \ No newline at end of file diff --git a/docs/examples/docker_example.py b/docs/examples/docker_example.py index b43e8ee6..898f14da 100644 --- a/docs/examples/docker_example.py +++ b/docs/examples/docker_example.py @@ -48,8 +48,8 @@ class Crawl4AiTester: def test_docker_deployment(version="basic"): tester = Crawl4AiTester( base_url="http://localhost:11235" , - # base_url="https://crawl4ai-sby74.ondigitalocean.app", - api_token="test" + # base_url="https://api.crawl4ai.com" # just for example + # api_token="test" # just for example ) print(f"Testing Crawl4AI Docker {version} version") @@ -69,6 +69,7 @@ def test_docker_deployment(version="basic"): # Test cases based on version test_basic_crawl(tester) + test_basic_crawl(tester) test_basic_crawl_sync(tester) # if version in ["full", "transformer"]: diff --git a/docs/examples/quickstart_async.py b/docs/examples/quickstart_async.py index 9c57f57d..d67a8c30 100644 --- a/docs/examples/quickstart_async.py +++ b/docs/examples/quickstart_async.py @@ -71,12 +71,12 @@ async def use_proxy(): "Note: Replace 'http://your-proxy-url:port' with a working proxy to run this example." 
) # Uncomment and modify the following lines to use a proxy - # async with AsyncWebCrawler(verbose=True, proxy="http://your-proxy-url:port") as crawler: - # result = await crawler.arun( - # url="https://www.nbcnews.com/business", - # bypass_cache=True - # ) - # print(result.markdown[:500]) # Print first 500 characters + async with AsyncWebCrawler(verbose=True, proxy="http://your-proxy-url:port") as crawler: + result = await crawler.arun( + url="https://www.nbcnews.com/business", + bypass_cache=True + ) + print(result.markdown[:500]) # Print first 500 characters async def capture_and_save_screenshot(url: str, output_path: str): async with AsyncWebCrawler(verbose=True) as crawler: diff --git a/requirements.txt b/requirements.txt index 74e8b3d6..e6294cc5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,4 +11,5 @@ beautifulsoup4~=4.12 tf-playwright-stealth~=1.0 xxhash~=3.4 rank-bm25~=0.2 -aiofiles~=24.0 \ No newline at end of file +aiofiles~=24.0 +colorama~=0.4 \ No newline at end of file diff --git a/tests/docker_example.py b/tests/docker_example.py new file mode 100644 index 00000000..658e80fd --- /dev/null +++ b/tests/docker_example.py @@ -0,0 +1,332 @@ +import requests +import json +import time +import sys +import base64 +import os +from typing import Dict, Any + +class Crawl4AiTester: + def __init__(self, base_url: str = "http://localhost:11235", api_token: str = None): + self.base_url = base_url + self.api_token = api_token or os.getenv('CRAWL4AI_API_TOKEN') # Check environment variable as fallback + self.headers = {'Authorization': f'Bearer {self.api_token}'} if self.api_token else {} + + def submit_and_wait(self, request_data: Dict[str, Any], timeout: int = 300) -> Dict[str, Any]: + # Submit crawl job + response = requests.post(f"{self.base_url}/crawl", json=request_data, headers=self.headers) + if response.status_code == 403: + raise Exception("API token is invalid or missing") + task_id = response.json()["task_id"] + print(f"Task ID: {task_id}") + 
+ # Poll for result + start_time = time.time() + while True: + if time.time() - start_time > timeout: + raise TimeoutError(f"Task {task_id} did not complete within {timeout} seconds") + + result = requests.get(f"{self.base_url}/task/{task_id}", headers=self.headers) + status = result.json() + + if status["status"] == "failed": + print("Task failed:", status.get("error")) + raise Exception(f"Task failed: {status.get('error')}") + + if status["status"] == "completed": + return status + + time.sleep(2) + + def submit_sync(self, request_data: Dict[str, Any]) -> Dict[str, Any]: + response = requests.post(f"{self.base_url}/crawl_sync", json=request_data, headers=self.headers, timeout=60) + if response.status_code == 408: + raise TimeoutError("Task did not complete within server timeout") + response.raise_for_status() + return response.json() + +def test_docker_deployment(version="basic"): + tester = Crawl4AiTester( + # base_url="http://localhost:11235" , + base_url="https://crawl4ai-sby74.ondigitalocean.app", + api_token="test" + ) + print(f"Testing Crawl4AI Docker {version} version") + + # Health check with timeout and retry + max_retries = 5 + for i in range(max_retries): + try: + health = requests.get(f"{tester.base_url}/health", timeout=10) + print("Health check:", health.json()) + break + except requests.exceptions.RequestException as e: + if i == max_retries - 1: + print(f"Failed to connect after {max_retries} attempts") + sys.exit(1) + print(f"Waiting for service to start (attempt {i+1}/{max_retries})...") + time.sleep(5) + + # Test cases based on version + test_basic_crawl(tester) + test_basic_crawl(tester) + test_basic_crawl_sync(tester) + + # if version in ["full", "transformer"]: + # test_cosine_extraction(tester) + + # test_js_execution(tester) + # test_css_selector(tester) + # test_structured_extraction(tester) + # test_llm_extraction(tester) + # test_llm_with_ollama(tester) + # test_screenshot(tester) + + +def test_basic_crawl(tester: Crawl4AiTester): + 
print("\n=== Testing Basic Crawl ===") + request = { + "urls": "https://www.nbcnews.com/business", + "priority": 10, + "session_id": "test" + } + + result = tester.submit_and_wait(request) + print(f"Basic crawl result length: {len(result['result']['markdown'])}") + assert result["result"]["success"] + assert len(result["result"]["markdown"]) > 0 + +def test_basic_crawl_sync(tester: Crawl4AiTester): + print("\n=== Testing Basic Crawl (Sync) ===") + request = { + "urls": "https://www.nbcnews.com/business", + "priority": 10, + "session_id": "test" + } + + result = tester.submit_sync(request) + print(f"Basic crawl result length: {len(result['result']['markdown'])}") + assert result['status'] == 'completed' + assert result['result']['success'] + assert len(result['result']['markdown']) > 0 + +def test_js_execution(tester: Crawl4AiTester): + print("\n=== Testing JS Execution ===") + request = { + "urls": "https://www.nbcnews.com/business", + "priority": 8, + "js_code": [ + "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();" + ], + "wait_for": "article.tease-card:nth-child(10)", + "crawler_params": { + "headless": True + } + } + + result = tester.submit_and_wait(request) + print(f"JS execution result length: {len(result['result']['markdown'])}") + assert result["result"]["success"] + +def test_css_selector(tester: Crawl4AiTester): + print("\n=== Testing CSS Selector ===") + request = { + "urls": "https://www.nbcnews.com/business", + "priority": 7, + "css_selector": ".wide-tease-item__description", + "crawler_params": { + "headless": True + }, + "extra": {"word_count_threshold": 10} + + } + + result = tester.submit_and_wait(request) + print(f"CSS selector result length: {len(result['result']['markdown'])}") + assert result["result"]["success"] + +def test_structured_extraction(tester: Crawl4AiTester): + print("\n=== Testing Structured Extraction ===") + 
schema = { + "name": "Coinbase Crypto Prices", + "baseSelector": ".cds-tableRow-t45thuk", + "fields": [ + { + "name": "crypto", + "selector": "td:nth-child(1) h2", + "type": "text", + }, + { + "name": "symbol", + "selector": "td:nth-child(1) p", + "type": "text", + }, + { + "name": "price", + "selector": "td:nth-child(2)", + "type": "text", + } + ], + } + + request = { + "urls": "https://www.coinbase.com/explore", + "priority": 9, + "extraction_config": { + "type": "json_css", + "params": { + "schema": schema + } + } + } + + result = tester.submit_and_wait(request) + extracted = json.loads(result["result"]["extracted_content"]) + print(f"Extracted {len(extracted)} items") + print("Sample item:", json.dumps(extracted[0], indent=2)) + assert result["result"]["success"] + assert len(extracted) > 0 + +def test_llm_extraction(tester: Crawl4AiTester): + print("\n=== Testing LLM Extraction ===") + schema = { + "type": "object", + "properties": { + "model_name": { + "type": "string", + "description": "Name of the OpenAI model." + }, + "input_fee": { + "type": "string", + "description": "Fee for input token for the OpenAI model." + }, + "output_fee": { + "type": "string", + "description": "Fee for output token for the OpenAI model." 
+ } + }, + "required": ["model_name", "input_fee", "output_fee"] + } + + request = { + "urls": "https://openai.com/api/pricing", + "priority": 8, + "extraction_config": { + "type": "llm", + "params": { + "provider": "openai/gpt-4o-mini", + "api_token": os.getenv("OPENAI_API_KEY"), + "schema": schema, + "extraction_type": "schema", + "instruction": """From the crawled content, extract all mentioned model names along with their fees for input and output tokens.""" + } + }, + "crawler_params": {"word_count_threshold": 1} + } + + try: + result = tester.submit_and_wait(request) + extracted = json.loads(result["result"]["extracted_content"]) + print(f"Extracted {len(extracted)} model pricing entries") + print("Sample entry:", json.dumps(extracted[0], indent=2)) + assert result["result"]["success"] + except Exception as e: + print(f"LLM extraction test failed (might be due to missing API key): {str(e)}") + +def test_llm_with_ollama(tester: Crawl4AiTester): + print("\n=== Testing LLM with Ollama ===") + schema = { + "type": "object", + "properties": { + "article_title": { + "type": "string", + "description": "The main title of the news article" + }, + "summary": { + "type": "string", + "description": "A brief summary of the article content" + }, + "main_topics": { + "type": "array", + "items": {"type": "string"}, + "description": "Main topics or themes discussed in the article" + } + } + } + + request = { + "urls": "https://www.nbcnews.com/business", + "priority": 8, + "extraction_config": { + "type": "llm", + "params": { + "provider": "ollama/llama2", + "schema": schema, + "extraction_type": "schema", + "instruction": "Extract the main article information including title, summary, and main topics." 
+ } + }, + "extra": {"word_count_threshold": 1}, + "crawler_params": {"verbose": True} + } + + try: + result = tester.submit_and_wait(request) + extracted = json.loads(result["result"]["extracted_content"]) + print("Extracted content:", json.dumps(extracted, indent=2)) + assert result["result"]["success"] + except Exception as e: + print(f"Ollama extraction test failed: {str(e)}") + +def test_cosine_extraction(tester: Crawl4AiTester): + print("\n=== Testing Cosine Extraction ===") + request = { + "urls": "https://www.nbcnews.com/business", + "priority": 8, + "extraction_config": { + "type": "cosine", + "params": { + "semantic_filter": "business finance economy", + "word_count_threshold": 10, + "max_dist": 0.2, + "top_k": 3 + } + } + } + + try: + result = tester.submit_and_wait(request) + extracted = json.loads(result["result"]["extracted_content"]) + print(f"Extracted {len(extracted)} text clusters") + print("First cluster tags:", extracted[0]["tags"]) + assert result["result"]["success"] + except Exception as e: + print(f"Cosine extraction test failed: {str(e)}") + +def test_screenshot(tester: Crawl4AiTester): + print("\n=== Testing Screenshot ===") + request = { + "urls": "https://www.nbcnews.com/business", + "priority": 5, + "screenshot": True, + "crawler_params": { + "headless": True + } + } + + result = tester.submit_and_wait(request) + print("Screenshot captured:", bool(result["result"]["screenshot"])) + + if result["result"]["screenshot"]: + # Save screenshot + screenshot_data = base64.b64decode(result["result"]["screenshot"]) + with open("test_screenshot.jpg", "wb") as f: + f.write(screenshot_data) + print("Screenshot saved as test_screenshot.jpg") + + assert result["result"]["success"] + +if __name__ == "__main__": + version = sys.argv[1] if len(sys.argv) > 1 else "basic" + # version = "full" + test_docker_deployment(version) \ No newline at end of file From 3a524a3bdd3afdd58d64c336031e7687fdfe5631 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sun, 17 Nov 
2024 16:00:39 +0800 Subject: [PATCH 039/115] fix(docs): remove unnecessary blank line in README for improved readability --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index a2806304..069c02b8 100644 --- a/README.md +++ b/README.md @@ -119,7 +119,6 @@ Deploy your own instance of Crawl4AI with one click: [![DigitalOcean Referral Badge](https://web-platforms.sfo2.cdn.digitaloceanspaces.com/WWW/Badge%203.svg)](https://www.digitalocean.com/?repo=https://github.com/unclecode/crawl4ai/tree/0.3.74&refcode=a0780f1bdb3d&utm_campaign=Referral_Invite&utm_medium=Referral_Program&utm_source=badge) - > 💡 **Recommended specs**: 4GB RAM minimum. Select "professional-xs" or higher when deploying for stable operation. The deploy will: From 2a82455b3dd3427f3099e201c2d88fadcc0c78fc Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sun, 17 Nov 2024 17:17:34 +0800 Subject: [PATCH 040/115] feat(crawl): implement direct crawl functionality and introduce CacheMode for improved caching control --- docs/examples/docker_example.py | 33 ++++++++++++-- docs/md_v2/basic/cache-modes.md | 79 +++++++++++++++++++++++++++++++++ main.py | 46 ++++++++++++++++++- mkdocs.yml | 1 + 4 files changed, 153 insertions(+), 6 deletions(-) create mode 100644 docs/md_v2/basic/cache-modes.md diff --git a/docs/examples/docker_example.py b/docs/examples/docker_example.py index 898f14da..17ef9f04 100644 --- a/docs/examples/docker_example.py +++ b/docs/examples/docker_example.py @@ -9,7 +9,7 @@ from typing import Dict, Any class Crawl4AiTester: def __init__(self, base_url: str = "http://localhost:11235", api_token: str = None): self.base_url = base_url - self.api_token = api_token or os.getenv('CRAWL4AI_API_TOKEN') # Check environment variable as fallback + self.api_token = api_token or os.getenv('CRAWL4AI_API_TOKEN') or "test_api_code" # Check environment variable as fallback self.headers = {'Authorization': f'Bearer {self.api_token}'} if self.api_token else {} def 
submit_and_wait(self, request_data: Dict[str, Any], timeout: int = 300) -> Dict[str, Any]: @@ -44,6 +44,16 @@ class Crawl4AiTester: raise TimeoutError("Task did not complete within server timeout") response.raise_for_status() return response.json() + + def crawl_direct(self, request_data: Dict[str, Any]) -> Dict[str, Any]: + """Directly crawl without using task queue""" + response = requests.post( + f"{self.base_url}/crawl_direct", + json=request_data, + headers=self.headers + ) + response.raise_for_status() + return response.json() def test_docker_deployment(version="basic"): tester = Crawl4AiTester( @@ -68,9 +78,10 @@ def test_docker_deployment(version="basic"): time.sleep(5) # Test cases based on version - test_basic_crawl(tester) - test_basic_crawl(tester) - test_basic_crawl_sync(tester) + # test_basic_crawl(tester) + # test_basic_crawl(tester) + # test_basic_crawl_sync(tester) + test_basic_crawl_direct(tester) # if version in ["full", "transformer"]: # test_cosine_extraction(tester) @@ -110,6 +121,20 @@ def test_basic_crawl_sync(tester: Crawl4AiTester): assert result['result']['success'] assert len(result['result']['markdown']) > 0 +def test_basic_crawl_direct(tester: Crawl4AiTester): + print("\n=== Testing Basic Crawl (Direct) ===") + request = { + "urls": "https://www.nbcnews.com/business", + "priority": 10, + # "session_id": "test" + "cache_mode": "bypass" # or "enabled", "disabled", "read_only", "write_only" + } + + result = tester.crawl_direct(request) + print(f"Basic crawl result length: {len(result['result']['markdown'])}") + assert result['result']['success'] + assert len(result['result']['markdown']) > 0 + def test_js_execution(tester: Crawl4AiTester): print("\n=== Testing JS Execution ===") request = { diff --git a/docs/md_v2/basic/cache-modes.md b/docs/md_v2/basic/cache-modes.md new file mode 100644 index 00000000..04a4f218 --- /dev/null +++ b/docs/md_v2/basic/cache-modes.md @@ -0,0 +1,79 @@ +# Crawl4AI Cache System and Migration Guide + +## 
Overview +Starting from version X.X.X, Crawl4AI introduces a new caching system that replaces the old boolean flags with a more intuitive `CacheMode` enum. This change simplifies cache control and makes the behavior more predictable. + +## Old vs New Approach + +### Old Way (Deprecated) +The old system used multiple boolean flags: +- `bypass_cache`: Skip cache entirely +- `disable_cache`: Disable all caching +- `no_cache_read`: Don't read from cache +- `no_cache_write`: Don't write to cache + +### New Way (Recommended) +The new system uses a single `CacheMode` enum: +- `CacheMode.ENABLED`: Normal caching (read/write) +- `CacheMode.DISABLED`: No caching at all +- `CacheMode.READ_ONLY`: Only read from cache +- `CacheMode.WRITE_ONLY`: Only write to cache +- `CacheMode.BYPASS`: Skip cache for this operation + +## Migration Example + +### Old Code (Deprecated) +```python +import asyncio +from crawl4ai import AsyncWebCrawler + +async def use_proxy(): + async with AsyncWebCrawler(verbose=True) as crawler: + result = await crawler.arun( + url="https://www.nbcnews.com/business", + bypass_cache=True # Old way + ) + print(len(result.markdown)) + +async def main(): + await use_proxy() + +if __name__ == "__main__": + asyncio.run(main()) +``` + +### New Code (Recommended) +```python +import asyncio +from crawl4ai import AsyncWebCrawler, CacheMode # Import CacheMode + +async def use_proxy(): + async with AsyncWebCrawler(verbose=True) as crawler: + result = await crawler.arun( + url="https://www.nbcnews.com/business", + cache_mode=CacheMode.BYPASS # New way + ) + print(len(result.markdown)) + +async def main(): + await use_proxy() + +if __name__ == "__main__": + asyncio.run(main()) +``` + +## Common Migration Patterns + +Old Flag | New Mode +---------|---------- +`bypass_cache=True` | `cache_mode=CacheMode.BYPASS` +`disable_cache=True` | `cache_mode=CacheMode.DISABLED` +`no_cache_read=True` | `cache_mode=CacheMode.WRITE_ONLY` +`no_cache_write=True` | 
`cache_mode=CacheMode.READ_ONLY` + +## Suppressing Deprecation Warnings +If you need time to migrate, you can temporarily suppress deprecation warnings: +```python +# In your config.py +SHOW_DEPRECATION_WARNINGS = False +``` diff --git a/main.py b/main.py index 41788d61..ee5f7fc6 100644 --- a/main.py +++ b/main.py @@ -25,7 +25,7 @@ import logging from enum import Enum from dataclasses import dataclass import json -from crawl4ai import AsyncWebCrawler, CrawlResult +from crawl4ai import AsyncWebCrawler, CrawlResult, CacheMode from crawl4ai.extraction_strategy import ( LLMExtractionStrategy, CosineStrategy, @@ -66,6 +66,7 @@ class CrawlRequest(BaseModel): magic: bool = False extra: Optional[Dict[str, Any]] = {} session_id: Optional[str] = None + cache_mode: Optional[CacheMode] = None @dataclass class TaskInfo: @@ -329,7 +330,7 @@ app.mount("/pages", StaticFiles(directory=__location__ + "/pages"), name="pages" # API token security security = HTTPBearer() -CRAWL4AI_API_TOKEN = os.getenv("CRAWL4AI_API_TOKEN") +CRAWL4AI_API_TOKEN = os.getenv("CRAWL4AI_API_TOKEN") or "test_api_code" async def verify_token(credentials: HTTPAuthorizationCredentials = Security(security)): if not CRAWL4AI_API_TOKEN: @@ -419,6 +420,47 @@ async def crawl_sync(request: CrawlRequest) -> Dict[str, Any]: # If we get here, task didn't complete within timeout raise HTTPException(status_code=408, detail="Task timed out") +@app.post("/crawl_direct", dependencies=[Depends(verify_token)]) +async def crawl_direct(request: CrawlRequest) -> Dict[str, Any]: + try: + crawler = await crawler_service.crawler_pool.acquire(**request.crawler_params) + extraction_strategy = crawler_service._create_extraction_strategy(request.extraction_config) + + try: + if isinstance(request.urls, list): + results = await crawler.arun_many( + urls=[str(url) for url in request.urls], + extraction_strategy=extraction_strategy, + js_code=request.js_code, + wait_for=request.wait_for, + css_selector=request.css_selector, + 
screenshot=request.screenshot, + magic=request.magic, + cache_mode=request.cache_mode, + session_id=request.session_id, + **request.extra, + ) + return {"results": [result.dict() for result in results]} + else: + result = await crawler.arun( + url=str(request.urls), + extraction_strategy=extraction_strategy, + js_code=request.js_code, + wait_for=request.wait_for, + css_selector=request.css_selector, + screenshot=request.screenshot, + magic=request.magic, + cache_mode=request.cache_mode, + session_id=request.session_id, + **request.extra, + ) + return {"result": result.dict()} + finally: + await crawler_service.crawler_pool.release(crawler) + except Exception as e: + logger.error(f"Error in direct crawl: {str(e)}") + raise HTTPException(status_code=500, detail=str(e)) + @app.get("/health") async def health_check(): available_slots = await crawler_service.resource_monitor.get_available_slots() diff --git a/mkdocs.yml b/mkdocs.yml index b09cb9eb..1b26b9df 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -17,6 +17,7 @@ nav: - 'Browser Configuration': 'basic/browser-config.md' - 'Page Interaction': 'basic/page-interaction.md' - 'Content Selection': 'basic/content-selection.md' + - 'Cache Modes': 'basic/cache-modes.md' - Advanced: - 'Content Processing': 'advanced/content-processing.md' From f9fe6f89feafeba175dc35da64ca5f6883839473 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sun, 17 Nov 2024 18:09:33 +0800 Subject: [PATCH 041/115] feat(database): implement version management and migration checks during initialization --- crawl4ai/async_database.py | 39 +++++++++++++++++++++++++-- crawl4ai/version_manager.py | 30 +++++++++++++++++++++ docs/md_v2/basic/installation.md | 45 ++++++++++++++++++++++++++++++++ setup.py | 2 +- 4 files changed, 113 insertions(+), 3 deletions(-) create mode 100644 crawl4ai/version_manager.py diff --git a/crawl4ai/async_database.py b/crawl4ai/async_database.py index f97d8131..7809dfe1 100644 --- a/crawl4ai/async_database.py +++ 
b/crawl4ai/async_database.py @@ -11,6 +11,7 @@ from .models import CrawlResult import xxhash import aiofiles from .config import NEED_MIGRATION +from .version_manager import VersionManager # Set up logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -28,22 +29,49 @@ class AsyncDatabaseManager: self.connection_pool: Dict[int, aiosqlite.Connection] = {} self.pool_lock = asyncio.Lock() self.connection_semaphore = asyncio.Semaphore(pool_size) + self._initialized = False + self.version_manager = VersionManager() + async def initialize(self): """Initialize the database and connection pool""" try: logger.info("Initializing database...") + # Ensure the database file exists + os.makedirs(os.path.dirname(self.db_path), exist_ok=True) + + # Check if version update is needed + needs_update = self.version_manager.needs_update() + + # Always ensure base table exists await self.ainit_db() - if NEED_MIGRATION: + + # Verify the table exists + async def verify_table(db): + async with db.execute( + "SELECT name FROM sqlite_master WHERE type='table' AND name='crawled_data'" + ) as cursor: + result = await cursor.fetchone() + if not result: + raise Exception("crawled_data table was not created") + + await self.execute_with_retry(verify_table) + + # If version changed or fresh install, run updates + if needs_update: + logger.info("New version detected, running updates...") await self.update_db_schema() from .migrations import run_migration # Import here to avoid circular imports await run_migration() - logger.info("Database initialization and migration completed successfully") + self.version_manager.update_version() # Update stored version after successful migration + logger.info("Version update completed successfully") else: logger.info("Database initialization completed successfully") + except Exception as e: logger.error(f"Database initialization error: {e}") logger.info("Database will be initialized on first use") + raise async def cleanup(self): 
"""Cleanup connections when shutting down""" @@ -55,6 +83,12 @@ class AsyncDatabaseManager: @asynccontextmanager async def get_connection(self): """Connection pool manager""" + if not self._initialized: + async with self.pool_lock: # Prevent multiple simultaneous initializations + if not self._initialized: # Double-check after acquiring lock + await self.initialize() + self._initialized = True + async with self.connection_semaphore: task_id = id(asyncio.current_task()) try: @@ -79,6 +113,7 @@ class AsyncDatabaseManager: await self.connection_pool[task_id].close() del self.connection_pool[task_id] + async def execute_with_retry(self, operation, *args): """Execute database operations with retry logic""" for attempt in range(self.max_retries): diff --git a/crawl4ai/version_manager.py b/crawl4ai/version_manager.py new file mode 100644 index 00000000..07e0c0e9 --- /dev/null +++ b/crawl4ai/version_manager.py @@ -0,0 +1,30 @@ +# version_manager.py +import os +from pathlib import Path +from packaging import version +from . 
import __version__ + +class VersionManager: + def __init__(self): + self.home_dir = Path.home() / ".crawl4ai" + self.version_file = self.home_dir / "version.txt" + + def get_installed_version(self): + """Get the version recorded in home directory""" + if not self.version_file.exists(): + return None + try: + return version.parse(self.version_file.read_text().strip()) + except: + return None + + def update_version(self): + """Update the version file to current library version""" + self.version_file.write_text(__version__) + + def needs_update(self): + """Check if database needs update based on version""" + installed = self.get_installed_version() + current = version.parse(__version__) + return installed is None or installed < current + diff --git a/docs/md_v2/basic/installation.md b/docs/md_v2/basic/installation.md index a4a60857..de8aeafa 100644 --- a/docs/md_v2/basic/installation.md +++ b/docs/md_v2/basic/installation.md @@ -58,6 +58,51 @@ crawl4ai-download-models This is optional but will boost the performance and speed of the crawler. You only need to do this once after installation. 
+## Playwright Installation Note for Ubuntu + +If you encounter issues with Playwright installation on Ubuntu, you may need to install additional dependencies: + +```bash +sudo apt-get install -y \ + libwoff1 \ + libopus0 \ + libwebp7 \ + libwebpdemux2 \ + libenchant-2-2 \ + libgudev-1.0-0 \ + libsecret-1-0 \ + libhyphen0 \ + libgdk-pixbuf2.0-0 \ + libegl1 \ + libnotify4 \ + libxslt1.1 \ + libevent-2.1-7 \ + libgles2 \ + libxcomposite1 \ + libatk1.0-0 \ + libatk-bridge2.0-0 \ + libepoxy0 \ + libgtk-3-0 \ + libharfbuzz-icu0 \ + libgstreamer-gl1.0-0 \ + libgstreamer-plugins-bad1.0-0 \ + gstreamer1.0-plugins-good \ + gstreamer1.0-plugins-bad \ + libxt6 \ + libxaw7 \ + xvfb \ + fonts-noto-color-emoji \ + libfontconfig \ + libfreetype6 \ + xfonts-cyrillic \ + xfonts-scalable \ + fonts-liberation \ + fonts-ipafont-gothic \ + fonts-wqy-zenhei \ + fonts-tlwg-loma-otf \ + fonts-freefont-ttf +``` + ## Option 2: Using Docker (Coming Soon) Docker support for Crawl4AI is currently in progress and will be available soon. This will allow you to run Crawl4AI in a containerized environment, ensuring consistency across different systems. diff --git a/setup.py b/setup.py index d8ad2cd3..bbc03026 100644 --- a/setup.py +++ b/setup.py @@ -70,7 +70,7 @@ class PostInstallCommand(install): def run(self): install.run(self) install_playwright() - run_migration() + # run_migration() setup( name="Crawl4AI", From a59c107b237ccdab1036f08123421f2645a628f3 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sun, 17 Nov 2024 18:42:43 +0800 Subject: [PATCH 042/115] Update changelog for 0.3.74 --- CHANGELOG.md | 220 ++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 201 insertions(+), 19 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e82fa6a2..8e5cc91a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,194 @@ # Changelog +## [0.3.74] November 17, 2024 -## Version 0.3.74, Major Changes +This changelog details the updates and changes introduced in Crawl4AI version 0.3.74. 
It's designed to inform developers about new features, modifications to existing components, removals, and other important information. + +### 1. File Download Processing + +- Users can now specify download folders using the `downloads_path` parameter in the `AsyncWebCrawler` constructor or the `arun` method. If not specified, downloads are saved to a "downloads" folder within the `.crawl4ai` directory. +- File download tracking is integrated into the `CrawlResult` object. Successfully downloaded files are listed in the `downloaded_files` attribute, providing their paths. +- Added `accept_downloads` parameter to the crawler strategies (defaults to `False`). If set to `True`, you can use the `js_code` and `wait_for` parameters to trigger a file download and wait for it to start. + +**Example:** + +```python +import asyncio +import os +from pathlib import Path +from crawl4ai import AsyncWebCrawler + +async def download_example(): + downloads_path = os.path.join(Path.home(), ".crawl4ai", "downloads") + os.makedirs(downloads_path, exist_ok=True) + + async with AsyncWebCrawler( + accept_downloads=True, + downloads_path=downloads_path, + verbose=True + ) as crawler: + result = await crawler.arun( + url="https://www.python.org/downloads/", + js_code=""" + const downloadLink = document.querySelector('a[href$=".exe"]'); + if (downloadLink) { downloadLink.click(); } + """, + wait_for=5 # To ensure download has started + ) + + if result.downloaded_files: + print("Downloaded files:") + for file in result.downloaded_files: + print(f"- {file}") + +asyncio.run(download_example()) + +``` + +### 2. Refined Content Filtering + +- Introduced the `RelevantContentFilter` strategy (and its implementation `BM25ContentFilter`) for extracting relevant content from web pages, replacing Fit Markdown and other content cleaning strategies. This new strategy leverages the BM25 algorithm to identify chunks of text relevant to the page's title, description, keywords, or a user-provided query. 
+- The `fit_markdown` flag in the content scraper is used to filter content based on title, meta description, and keywords. + +**Example:** + +```python +from crawl4ai import AsyncWebCrawler +from crawl4ai.content_filter_strategy import BM25ContentFilter + +async def filter_content(url, query): + async with AsyncWebCrawler() as crawler: + content_filter = BM25ContentFilter(user_query=query) + result = await crawler.arun(url=url, extraction_strategy=content_filter, fit_markdown=True) + print(result.extracted_content) # Or result.fit_markdown for the markdown version + print(result.fit_html) # HTML containing only the filtered content + +asyncio.run(filter_content("https://en.wikipedia.org/wiki/Apple", "fruit nutrition health")) +``` + +### 3. Raw HTML and Local File Support + +- Added support for crawling local files and raw HTML content directly. +- Use the `file://` prefix for local file paths. +- Use the `raw:` prefix for raw HTML strings. + +**Example:** + +```python +async def crawl_local_or_raw(crawler, content, content_type): + prefix = "file://" if content_type == "local" else "raw:" + url = f"{prefix}{content}" + result = await crawler.arun(url=url) + if result.success: + print(f"Markdown Content from {content_type.title()} Source:") + print(result.markdown) + +# Example usage with local file and raw HTML +async def main(): + async with AsyncWebCrawler() as crawler: + # Local File + await crawl_local_or_raw( + crawler, os.path.abspath('tests/async/sample_wikipedia.html'), "local" + ) + # Raw HTML + await crawl_local_or_raw(crawler, "<h1>Raw Test</h1><p>This is raw HTML.</p>", "raw") + + +asyncio.run(main()) +``` + +### 4. Browser Management + +- New asynchronous crawler strategy implemented using Playwright. +- `ManagedBrowser` class introduced for improved browser session handling, offering features like persistent browser sessions between requests (using `session_id` parameter) and browser process monitoring. +- Updated to tf-playwright-stealth for enhanced stealth capabilities. +- Added `use_managed_browser`, `use_persistent_context`, and `chrome_channel` parameters to AsyncPlaywrightCrawlerStrategy. + + +**Example:** +```python +async def browser_management_demo(): + user_data_dir = os.path.join(Path.home(), ".crawl4ai", "user-data-dir") + os.makedirs(user_data_dir, exist_ok=True) # Ensure directory exists + async with AsyncWebCrawler( + use_managed_browser=True, + user_data_dir=user_data_dir, + use_persistent_context=True, + verbose=True + ) as crawler: + result1 = await crawler.arun( + url="https://example.com", session_id="my_session" + ) + result2 = await crawler.arun( + url="https://example.com/anotherpage", session_id="my_session" + ) + +asyncio.run(browser_management_demo()) +``` + + +### 5. API Server & Cache Improvements + +- Added CORS support to API server. +- Implemented static file serving. +- Enhanced root redirect functionality. +- Cache database updated to store response headers and downloaded files information. It utilizes a file system approach to manage large content efficiently. +- New, more efficient caching database built using xxhash and file system approach. +- Introduced `CacheMode` enum (`ENABLED`, `DISABLED`, `READ_ONLY`, `WRITE_ONLY`, `BYPASS`) and `always_bypass_cache` parameter in AsyncWebCrawler for fine-grained cache control. This replaces `bypass_cache`, `no_cache_read`, `no_cache_write`, and `always_by_pass_cache`. + + +### 🗑️ Removals + +- Removed deprecated: `crawl4ai/content_cleaning_strategy.py`. 
+- Removed internal class `ContentCleaningStrategy`. +- Removed legacy cache control flags: `bypass_cache`, `disable_cache`, `no_cache_read`, `no_cache_write`, and `always_by_pass_cache`. These have been superseded by `cache_mode`. + + +### ⚙️ Other Changes + +- Moved version file to `crawl4ai/__version__.py`. +- Added `crawl4ai/cache_context.py`. +- Added `crawl4ai/version_manager.py`. +- Added `crawl4ai/migrations.py`. +- Added `crawl4ai-migrate` entry point. +- Added config `NEED_MIGRATION` and `SHOW_DEPRECATION_WARNINGS`. +- API server now requires an API token for authentication, configurable with the `CRAWL4AI_API_TOKEN` environment variable. This enhances API security. +- Added synchronous crawl endpoint `/crawl_sync` for immediate result retrieval, and direct crawl endpoint `/crawl_direct` bypassing the task queue. + + +### ⚠️ Deprecation Notices + +- The synchronous version of `WebCrawler` is being phased out. While still available via `crawl4ai[sync]`, it will eventually be removed. Transition to `AsyncWebCrawler` is strongly recommended. Boolean cache control flags in `arun` are also deprecated; migrate to the `cache_mode` parameter instead. See the "Example of migrating to the new CacheMode" section below for correct usage. + + +### 🐛 Bug Fixes + +- Resolved issue with browser context closing unexpectedly in Docker. This significantly improves stability, particularly within containerized environments. +- Fixed memory leaks associated with incorrect asynchronous cleanup by removing the `__del__` method and ensuring the browser context is closed explicitly using context managers. +- Improved error handling in `WebScrapingStrategy`. More detailed error messages and suggestions for debugging will minimize frustration when running into unexpected issues. +- Fixed issue with incorrect text parsing in specific HTML structures. 
+ + +### Example of migrating to the new CacheMode: + +**Old way:** + +```python +crawler = AsyncWebCrawler(always_by_pass_cache=True) +result = await crawler.arun(url="https://example.com", bypass_cache=True) +``` + +**New way:** + +```python +from crawl4ai import CacheMode + +crawler = AsyncWebCrawler(always_bypass_cache=True) +result = await crawler.arun(url="https://example.com", cache_mode=CacheMode.BYPASS) +``` + + +## [0.3.74] - November 13, 2024 1. **File Download Processing** (Nov 14, 2024) - Added capability for users to specify download folders @@ -30,14 +217,9 @@ - Implemented static file serving - Enhanced root redirect functionality -# [0.3.74] November 14, 2024 - -- In this commit, the library is updated to process file downloads. Users can now specify a download folder and trigger the download process via JavaScript or other means, with all files being saved. The list of downloaded files will also be added to the crowd result object. -- Another thing this commit introduces is the concept of the Relevance Content Filter. This is an improvement over Fit Markdown. This class of strategies aims to extract the main content from a given page - the part that really matters and is useful to be processed. One strategy has been created using the BM25 algorithm, which finds chunks of text from the web page relevant to its title, descriptions, and keywords, or supports a given user query and matches them. The result is then returned to the main engine to be converted to Markdown. Plans include adding approaches using language models as well. -- The cache database was updated to hold information about response headers and downloaded files. 
-# Changelog - November 13, 2024 +## [0.3.731] - November 13, 2024 ### Added - Support for raw HTML and local file crawling via URL prefixes ('raw:', 'file://') @@ -137,7 +319,7 @@ - Modified database connection management approach - Updated API response structure for better consistency -## Migration Guide +### Migration Guide When upgrading to v0.3.73, be aware of the following changes: 1. Docker Deployment: @@ -159,7 +341,7 @@ When upgrading to v0.3.73, be aware of the following changes: - Follow recommended fixes for any identified problems -## [2024-11-04 - 13:21:42] Comprehensive Update of Crawl4AI Features and Dependencies +## [v0.3.73] - 2024-11-04 This commit introduces several key enhancements, including improved error handling and robust database operations in `async_database.py`, which now features a connection pool and retry logic for better reliability. Updates to the README.md provide clearer instructions and a better user experience with links to documentation sections. The `.gitignore` file has been refined to include additional directories, while the async web crawler now utilizes a managed browser for more efficient crawling. Furthermore, multiple dependency updates and introduction of the `CustomHTML2Text` class enhance text extraction capabilities. ## [v0.3.73] - 2024-10-24 @@ -405,43 +587,43 @@ These updates aim to provide more flexibility in text processing, improve perfor - Allows retrieval of content after a specified delay, useful for dynamically loaded content. - **How to use**: Access `result.get_delayed_content(delay_in_seconds)` after crawling. -## Improvements and Optimizations +### Improvements and Optimizations -### 1. AsyncWebCrawler Enhancements +#### 1. AsyncWebCrawler Enhancements - **Flexible Initialization**: Now accepts arbitrary keyword arguments, passed directly to the crawler strategy. - Allows for more customized setups. -### 2. Image Processing Optimization +#### 2. 
Image Processing Optimization - Enhanced image handling in WebScrapingStrategy. - Added filtering for small, invisible, or irrelevant images. - Improved image scoring system for better content relevance. - Implemented JavaScript-based image dimension updating for more accurate representation. -### 3. Database Schema Auto-updates +#### 3. Database Schema Auto-updates - Automatic database schema updates ensure compatibility with the latest version. -### 4. Enhanced Error Handling and Logging +#### 4. Enhanced Error Handling and Logging - Improved error messages and logging for easier debugging. -### 5. Content Extraction Refinements +#### 5. Content Extraction Refinements - Refined HTML sanitization process. - Improved handling of base64 encoded images. - Enhanced Markdown conversion process. - Optimized content extraction algorithms. -### 6. Utility Function Enhancements +#### 6. Utility Function Enhancements - `perform_completion_with_backoff` function now supports additional arguments for more customized API calls to LLM providers. -## Bug Fixes +### Bug Fixes - Fixed an issue where image tags were being prematurely removed during content extraction. -## Examples and Documentation +### Examples and Documentation - Updated `quickstart_async.py` with examples of: - Using custom headers in LLM extraction. - Different LLM provider usage (OpenAI, Hugging Face, Ollama). - Custom browser type usage. -## Developer Notes +### Developer Notes - Refactored code for better maintainability, flexibility, and performance. - Enhanced type hinting throughout the codebase for improved development experience. - Expanded error handling for more robust operation. 
From df63a4060673b2d5647abdce07810e29cf20e739 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sun, 17 Nov 2024 19:44:45 +0800 Subject: [PATCH 043/115] feat(docs): update examples and documentation to replace bypass_cache with cache_mode for improved clarity --- README.md | 24 ++- crawl4ai/async_webcrawler.py | 37 +++-- crawl4ai/content_scrapping_strategy.py | 15 +- docs/examples/v0.3.74.overview.py | 2 +- docs/md_v2/advanced/managed_browser.md | 84 ++++++++++ .../advanced/session-management-advanced.md | 10 +- docs/md_v2/advanced/session-management.md | 2 +- docs/md_v2/api/arun.md | 42 +++-- docs/md_v2/api/crawl-result.md | 1 + docs/md_v2/api/parameters.md | 3 +- docs/md_v2/basic/content_filtering.md | 84 ++++++++++ docs/md_v2/basic/file-download.md | 148 ++++++++++++++++++ docs/md_v2/basic/quickstart.md | 22 +-- docs/md_v2/basic/simple-crawling.md | 10 +- .../episode_11_2_Extraction_Strategies_LLM.md | 4 +- ...isode_11_3_Extraction_Strategies_Cosine.md | 4 +- docs/md_v2/tutorial/tutorial.md | 10 +- 17 files changed, 422 insertions(+), 80 deletions(-) create mode 100644 docs/md_v2/advanced/managed_browser.md create mode 100644 docs/md_v2/basic/content_filtering.md create mode 100644 docs/md_v2/basic/file-download.md diff --git a/README.md b/README.md index 069c02b8..9c3796cd 100644 --- a/README.md +++ b/README.md @@ -11,21 +11,19 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it accessible for large language models (LLMs) and AI applications. 🆓🌐 -## 🌟 Meet the Crawl4AI Assistant: Your Copilot for Crawling +## New in 0.3.74 ✨ -Use the [Crawl4AI GPT Assistant](https://tinyurl.com/crawl4ai-gpt) as your AI-powered copilot! With this assistant, you can: +- 🚀 **Blazing Fast Scraping:** The scraping process is now significantly faster, often completing in under 100 milliseconds (excluding web fetch time)! +- 📥 **Download Mastery:** Control downloads, specify folders, and track files within the `CrawlResult` object. 
+- 🔎 **Relevance Filtering:** Extract the most important content with the new `RelevantContentFilter` and BM25 algorithm. Control filtering with the `fit_markdown` flag. +- 🗂️ **Local & Raw HTML:** Crawl local files (`file://`) and raw HTML strings (`raw:`) directly. +- 🤖 **Browser Boss:** Manage browser sessions with persistent contexts, process monitoring, and tf-playwright-stealth integration. Configure using `use_managed_browser`, `user_data_dir`, and `use_persistent_context` parameters. +- ☁️ **API & Cache Boost:** CORS support, static file serving, and a new filesystem-based cache for blazing-fast performance. Fine-tune caching with the `CacheMode` enum (ENABLED, DISABLED, READ_ONLY, WRITE_ONLY, BYPASS) and the `always_bypass_cache` parameter. +- 🔒 **API Security:** Protect your API server with token-based authentication using the `CRAWL4AI_API_TOKEN` environment variable. +- 🔄 **Synchronous & Direct Crawling:** Get immediate results with `/crawl_sync` or bypass the task queue with `/crawl_direct`. +- 🛠️ **Database Migration:** A new `crawl4ai-migrate` command ensures smooth upgrades and data integrity between versions. +- 🐛 **Squashed Bugs:** Fixed browser context issues in Docker and memory leaks; enhanced error handling and improved HTML parsing. -- 🧑‍💻 Generate code for complex crawling and extraction tasks -- 💡 Get tailored support and examples -- 📘 Learn Crawl4AI faster with step-by-step guidance - -## New in 0.3.73 ✨ - -- 🐳 Docker Ready: Full API server with seamless deployment & scaling -- 🎯 Browser Takeover: Use your own browser with cookies & history intact (CDP support) -- 📝 Mockdown+: Enhanced tag preservation & content extraction -- ⚡️ Parallel Power: Supercharged multi-URL crawling performance -- 🌟 And many more exciting updates... ## Try it Now! 
diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index d554576d..d22e3b1f 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -10,14 +10,14 @@ import asyncio from .models import CrawlResult from .async_database import async_db_manager from .chunking_strategy import * +from .content_filter_strategy import * from .extraction_strategy import * from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode from .content_scrapping_strategy import WebScrapingStrategy from .config import ( MIN_WORD_THRESHOLD, - IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD, - SHOW_DEPRECATION_WARNINGS # New import + IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD ) from .utils import ( sanitize_input_encode, @@ -49,7 +49,7 @@ class AsyncWebCrawler: ) To disable deprecation warnings: - Set SHOW_DEPRECATION_WARNINGS = False in config.py + Pass warning=False to suppress the warning. """ def __init__( @@ -85,11 +85,11 @@ class AsyncWebCrawler: # Handle deprecated parameter if always_by_pass_cache is not None: - if SHOW_DEPRECATION_WARNINGS: + if kwargs.get("warning", True): warnings.warn( "'always_by_pass_cache' is deprecated and will be removed in version X.X.X. " "Use 'always_bypass_cache' instead. 
" - "Set SHOW_DEPRECATION_WARNINGS=False in config.py to suppress this warning.", + "Pass warning=False to suppress this warning.", DeprecationWarning, stacklevel=2 ) @@ -126,6 +126,7 @@ class AsyncWebCrawler: word_count_threshold=MIN_WORD_THRESHOLD, extraction_strategy: ExtractionStrategy = None, chunking_strategy: ChunkingStrategy = RegexChunking(), + content_filter: RelevantContentFilter = None, cache_mode: Optional[CacheMode] = None, # Deprecated parameters bypass_cache: bool = False, @@ -172,7 +173,7 @@ class AsyncWebCrawler: try: # Handle deprecated parameters if any([bypass_cache, disable_cache, no_cache_read, no_cache_write]): - if SHOW_DEPRECATION_WARNINGS: + if kwargs.get("warning", True): warnings.warn( "Cache control boolean flags are deprecated and will be removed in version X.X.X. " "Use 'cache_mode' parameter instead. Examples:\n" @@ -180,7 +181,7 @@ class AsyncWebCrawler: "- For disable_cache=True, use cache_mode=CacheMode.DISABLED\n" "- For no_cache_read=True, use cache_mode=CacheMode.WRITE_ONLY\n" "- For no_cache_write=True, use cache_mode=CacheMode.READ_ONLY\n" - "Set SHOW_DEPRECATION_WARNINGS=False in config.py to suppress this warning.", + "Pass warning=False to suppress this warning.", DeprecationWarning, stacklevel=2 ) @@ -257,6 +258,7 @@ class AsyncWebCrawler: word_count_threshold=word_count_threshold, extraction_strategy=extraction_strategy, chunking_strategy=chunking_strategy, + content_filter=content_filter, css_selector=css_selector, screenshot=screenshot_data, verbose=verbose, @@ -308,6 +310,7 @@ class AsyncWebCrawler: word_count_threshold=MIN_WORD_THRESHOLD, extraction_strategy: ExtractionStrategy = None, chunking_strategy: ChunkingStrategy = RegexChunking(), + content_filter: RelevantContentFilter = None, cache_mode: Optional[CacheMode] = None, # Deprecated parameters bypass_cache: bool = False, @@ -335,14 +338,15 @@ class AsyncWebCrawler: Returns: List[CrawlResult]: Results for each URL """ - if bypass_cache and 
SHOW_DEPRECATION_WARNINGS: - warnings.warn( - "'bypass_cache' is deprecated and will be removed in version X.X.X. " - "Use 'cache_mode=CacheMode.BYPASS' instead. " - "Set SHOW_DEPRECATION_WARNINGS=False in config.py to suppress this warning.", - DeprecationWarning, - stacklevel=2 - ) + if bypass_cache: + if kwargs.get("warning", True): + warnings.warn( + "'bypass_cache' is deprecated and will be removed in version X.X.X. " + "Use 'cache_mode=CacheMode.BYPASS' instead. " + "Pass warning=False to suppress this warning.", + DeprecationWarning, + stacklevel=2 + ) if cache_mode is None: cache_mode = CacheMode.BYPASS @@ -356,6 +360,7 @@ class AsyncWebCrawler: word_count_threshold=word_count_threshold, extraction_strategy=extraction_strategy, chunking_strategy=chunking_strategy, + content_filter=content_filter, cache_mode=cache_mode, css_selector=css_selector, screenshot=screenshot, @@ -377,6 +382,7 @@ class AsyncWebCrawler: word_count_threshold: int, extraction_strategy: ExtractionStrategy, chunking_strategy: ChunkingStrategy, + content_filter: RelevantContentFilter, css_selector: str, screenshot: str, verbose: bool, @@ -397,6 +403,7 @@ class AsyncWebCrawler: image_description_min_word_threshold=kwargs.get( "image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD ), + content_filter = content_filter, **kwargs, ) diff --git a/crawl4ai/content_scrapping_strategy.py b/crawl4ai/content_scrapping_strategy.py index 9c81638c..d16b0680 100644 --- a/crawl4ai/content_scrapping_strategy.py +++ b/crawl4ai/content_scrapping_strategy.py @@ -532,14 +532,13 @@ class WebScrapingStrategy(ContentScrapingStrategy): fit_markdown = "Set flag 'fit_markdown' to True to get cleaned HTML content." fit_html = "Set flag 'fit_markdown' to True to get cleaned HTML content." 
- if kwargs.get('fit_markdown', False): - # cleaner = ContentCleaningStrategy() - # fit_html = cleaner.clean(cleaned_html) - # fit_markdown = h.handle(fit_html) - content_filter = BM25ContentFilter( - user_query= kwargs.get('fit_markdown_user_query', None), - bm25_threshold= kwargs.get('fit_markdown_bm25_threshold', 1.0) - ) + if kwargs.get('content_filter', None) or kwargs.get('fit_markdown', False): + content_filter = kwargs.get('content_filter', None) + if not content_filter: + content_filter = BM25ContentFilter( + user_query= kwargs.get('fit_markdown_user_query', None), + bm25_threshold= kwargs.get('fit_markdown_bm25_threshold', 1.0) + ) fit_html = content_filter.filter_content(html) fit_html = '\n'.join('
    {}
    '.format(s) for s in fit_html) fit_markdown = h.handle(fit_html) diff --git a/docs/examples/v0.3.74.overview.py b/docs/examples/v0.3.74.overview.py index 579d05dd..ec3a7d73 100644 --- a/docs/examples/v0.3.74.overview.py +++ b/docs/examples/v0.3.74.overview.py @@ -56,7 +56,7 @@ async def content_filtering_example(): result = await crawler.arun( url="https://openai.com/blog", - extraction_strategy=content_filter + content_filter=content_filter ) print(f"Filtered content: {result.extracted_content}") diff --git a/docs/md_v2/advanced/managed_browser.md b/docs/md_v2/advanced/managed_browser.md new file mode 100644 index 00000000..80d6fc1a --- /dev/null +++ b/docs/md_v2/advanced/managed_browser.md @@ -0,0 +1,84 @@ +# Content Filtering in Crawl4AI + +This guide explains how to use content filtering strategies in Crawl4AI to extract the most relevant information from crawled web pages. You'll learn how to use the built-in `BM25ContentFilter` and how to create your own custom content filtering strategies. + +## Relevance Content Filter + +The `RelevanceContentFilter` is an abstract class that provides a common interface for content filtering strategies. Specific filtering algorithms, like `BM25ContentFilter`, inherit from this class and implement the `filter_content` method. This method takes the HTML content as input and returns a list of filtered text blocks. + +## BM25 Algorithm + +The `BM25ContentFilter` uses the BM25 algorithm, a ranking function used in information retrieval to estimate the relevance of documents to a given search query. In Crawl4AI, this algorithm helps to identify and extract text chunks that are most relevant to the page's metadata or a user-specified query. + +### Usage + +To use the `BM25ContentFilter`, initialize it and then pass it as the `extraction_strategy` parameter to the `arun` method of the crawler. 
+ +```python +from crawl4ai import AsyncWebCrawler +from crawl4ai.content_filter_strategy import BM25ContentFilter + +async def filter_content(url, query=None): + async with AsyncWebCrawler() as crawler: + content_filter = BM25ContentFilter(user_query=query) + result = await crawler.arun(url=url, extraction_strategy=content_filter, fit_markdown=True) # Set fit_markdown flag to True to trigger BM25 filtering + if result.success: + print(f"Filtered Content (JSON):\n{result.extracted_content}") + print(f"\nFiltered Markdown:\n{result.fit_markdown}") # New field in CrawlResult object + print(f"\nFiltered HTML:\n{result.fit_html}") # New field in CrawlResult object. Note that raw HTML may have tags re-organized due to internal parsing. + else: + print("Error:", result.error_message) + +# Example usage: +asyncio.run(filter_content("https://en.wikipedia.org/wiki/Apple", "fruit nutrition health")) # with query +asyncio.run(filter_content("https://en.wikipedia.org/wiki/Apple")) # without query, metadata will be used as the query. + +``` + +### Parameters + +- **`user_query`**: (Optional) A string representing the search query. If not provided, the filter extracts relevant metadata (title, description, keywords) from the page and uses that as the query. +- **`bm25_threshold`**: (Optional, default 1.0) A float value that controls the threshold for relevance. Higher values result in stricter filtering, returning only the most relevant text chunks. Lower values result in more lenient filtering. + + +## Fit Markdown Flag + +Setting the `fit_markdown` flag to `True` in the `arun` method activates the BM25 content filtering during the crawl. The `fit_markdown` parameter instructs the scraper to extract and clean the HTML, primarily to prepare for a Large Language Model that cannot process large amounts of data. 
Setting this flag not only improves the quality of the extracted content but also adds the filtered content to two new attributes in the returned `CrawlResult` object: `fit_markdown` and `fit_html`. + + +## Custom Content Filtering Strategies + +You can create your own custom filtering strategies by inheriting from the `RelevantContentFilter` class and implementing the `filter_content` method. This allows you to tailor the filtering logic to your specific needs. + +```python +from crawl4ai.content_filter_strategy import RelevantContentFilter +from bs4 import BeautifulSoup, Tag +from typing import List + +class MyCustomFilter(RelevantContentFilter): + def filter_content(self, html: str) -> List[str]: + soup = BeautifulSoup(html, 'lxml') + # Implement custom filtering logic here + # Example: extract all paragraphs within divs with class "article-body" + filtered_paragraphs = [] + for tag in soup.select("div.article-body p"): + if isinstance(tag, Tag): + filtered_paragraphs.append(str(tag)) # Add the cleaned HTML element. + return filtered_paragraphs + + + +async def custom_filter_demo(url: str): + async with AsyncWebCrawler() as crawler: + custom_filter = MyCustomFilter() + result = await crawler.arun(url, extraction_strategy=custom_filter) + if result.success: + print(result.extracted_content) + +``` + +This example demonstrates extracting paragraphs from a specific div class. You can customize this logic to implement different filtering strategies, use regular expressions, analyze text density, or apply other relevant techniques. + +## Conclusion + +Content filtering strategies provide a powerful way to refine the output of your crawls. By using `BM25ContentFilter` or creating custom strategies, you can focus on the most pertinent information and improve the efficiency of your data processing pipeline. 
diff --git a/docs/md_v2/advanced/session-management-advanced.md b/docs/md_v2/advanced/session-management-advanced.md index f8c81da2..908828f7 100644 --- a/docs/md_v2/advanced/session-management-advanced.md +++ b/docs/md_v2/advanced/session-management-advanced.md @@ -30,7 +30,7 @@ Let's start with a basic example of session-based crawling: ```python import asyncio -from crawl4ai import AsyncWebCrawler +from crawl4ai import AsyncWebCrawler, CacheMode async def basic_session_crawl(): async with AsyncWebCrawler(verbose=True) as crawler: @@ -43,7 +43,7 @@ async def basic_session_crawl(): session_id=session_id, js_code="document.querySelector('.load-more-button').click();" if page > 0 else None, css_selector=".content-item", - bypass_cache=True + cache_mode=CacheMode.BYPASS ) print(f"Page {page + 1}: Found {result.extracted_content.count('.content-item')} items") @@ -102,7 +102,7 @@ async def advanced_session_crawl_with_hooks(): session_id=session_id, css_selector="li.commit-item", js_code=js_next_page if page > 0 else None, - bypass_cache=True, + cache_mode=CacheMode.BYPASS, js_only=page > 0 ) @@ -174,7 +174,7 @@ async def integrated_js_and_wait_crawl(): extraction_strategy=extraction_strategy, js_code=js_next_page_and_wait if page > 0 else None, js_only=page > 0, - bypass_cache=True + cache_mode=CacheMode.BYPASS ) commits = json.loads(result.extracted_content) @@ -241,7 +241,7 @@ async def wait_for_parameter_crawl(): js_code=js_next_page if page > 0 else None, wait_for=wait_for if page > 0 else None, js_only=page > 0, - bypass_cache=True + cache_mode=CacheMode.BYPASS ) commits = json.loads(result.extracted_content) diff --git a/docs/md_v2/advanced/session-management.md b/docs/md_v2/advanced/session-management.md index c38ed852..eae4cf7b 100644 --- a/docs/md_v2/advanced/session-management.md +++ b/docs/md_v2/advanced/session-management.md @@ -75,7 +75,7 @@ async def crawl_dynamic_content(): js_code=js_next_page if page > 0 else None, wait_for=wait_for if page > 0 else 
None, js_only=page > 0, - bypass_cache=True + cache_mode=CacheMode.BYPASS ) if result.success: diff --git a/docs/md_v2/api/arun.md b/docs/md_v2/api/arun.md index 9ef73aef..509991e5 100644 --- a/docs/md_v2/api/arun.md +++ b/docs/md_v2/api/arun.md @@ -8,11 +8,26 @@ The following parameters can be passed to the `arun()` method. They are organize await crawler.arun( url="https://example.com", # Required: URL to crawl verbose=True, # Enable detailed logging - bypass_cache=False, # Skip cache for this request + cache_mode=CacheMode.ENABLED, # Control cache behavior warmup=True # Whether to run warmup check ) ``` +## Cache Control + +```python +from crawl4ai import CacheMode + +await crawler.arun( + cache_mode=CacheMode.ENABLED, # Normal caching (read/write) + # Other cache modes: + # cache_mode=CacheMode.DISABLED # No caching at all + # cache_mode=CacheMode.READ_ONLY # Only read from cache + # cache_mode=CacheMode.WRITE_ONLY # Only write to cache + # cache_mode=CacheMode.BYPASS # Skip cache for this operation +) +``` + ## Content Processing Parameters ### Text Processing @@ -162,14 +177,13 @@ await crawler.arun( ## Parameter Interactions and Notes -1. **Magic Mode Combinations** +1. 
**Cache and Performance Setup** ```python - # Full anti-detection setup + # Optimal caching for repeated crawls await crawler.arun( - magic=True, - headless=False, - simulate_user=True, - override_navigator=True + cache_mode=CacheMode.ENABLED, + word_count_threshold=10, + process_iframes=False ) ``` @@ -179,7 +193,8 @@ await crawler.arun( await crawler.arun( js_code="window.scrollTo(0, document.body.scrollHeight);", wait_for="css:.lazy-content", - delay_before_return_html=2.0 + delay_before_return_html=2.0, + cache_mode=CacheMode.WRITE_ONLY # Cache results after dynamic load ) ``` @@ -192,7 +207,8 @@ await crawler.arun( extraction_strategy=my_strategy, chunking_strategy=my_chunking, process_iframes=True, - remove_overlay_elements=True + remove_overlay_elements=True, + cache_mode=CacheMode.ENABLED ) ``` @@ -201,7 +217,7 @@ await crawler.arun( 1. **Performance Optimization** ```python await crawler.arun( - bypass_cache=False, # Use cache when possible + cache_mode=CacheMode.ENABLED, # Use full caching word_count_threshold=10, # Filter out noise process_iframes=False # Skip iframes if not needed ) @@ -212,7 +228,8 @@ await crawler.arun( await crawler.arun( magic=True, # Enable anti-detection delay_before_return_html=1.0, # Wait for dynamic content - page_timeout=60000 # Longer timeout for slow pages + page_timeout=60000, # Longer timeout for slow pages + cache_mode=CacheMode.WRITE_ONLY # Cache results after successful crawl ) ``` @@ -221,6 +238,7 @@ await crawler.arun( await crawler.arun( remove_overlay_elements=True, # Remove popups excluded_tags=['nav', 'aside'],# Remove unnecessary elements - keep_data_attributes=False # Remove data attributes + keep_data_attributes=False, # Remove data attributes + cache_mode=CacheMode.ENABLED # Use cache for faster processing ) ``` \ No newline at end of file diff --git a/docs/md_v2/api/crawl-result.md b/docs/md_v2/api/crawl-result.md index 06998af3..7e3bda98 100644 --- a/docs/md_v2/api/crawl-result.md +++ 
b/docs/md_v2/api/crawl-result.md @@ -20,6 +20,7 @@ class CrawlResult(BaseModel): fit_html: Optional[str] = None # Most relevant HTML content markdown: Optional[str] = None # HTML converted to markdown fit_markdown: Optional[str] = None # Most relevant markdown content + downloaded_files: Optional[List[str]] = None # Downloaded files # Extracted Data extracted_content: Optional[str] = None # Content from extraction strategy diff --git a/docs/md_v2/api/parameters.md b/docs/md_v2/api/parameters.md index 6c7960d2..c1c4d2ea 100644 --- a/docs/md_v2/api/parameters.md +++ b/docs/md_v2/api/parameters.md @@ -32,4 +32,5 @@ | async_webcrawler.py | warmup | `kwargs.get("warmup", True)` | AsyncWebCrawler | Initialize crawler with warmup request | | async_webcrawler.py | session_id | `kwargs.get("session_id", None)` | AsyncWebCrawler | Session identifier for browser reuse | | async_webcrawler.py | only_text | `kwargs.get("only_text", False)` | AsyncWebCrawler | Extract only text content | -| async_webcrawler.py | bypass_cache | `kwargs.get("bypass_cache", False)` | AsyncWebCrawler | Skip cache and force fresh crawl | \ No newline at end of file +| async_webcrawler.py | bypass_cache | `kwargs.get("bypass_cache", False)` | AsyncWebCrawler | Skip cache and force fresh crawl | +| async_webcrawler.py | cache_mode | `kwargs.get("cache_mode", CacheMode.ENABLED)` | AsyncWebCrawler | Cache handling mode for request | \ No newline at end of file diff --git a/docs/md_v2/basic/content_filtering.md b/docs/md_v2/basic/content_filtering.md new file mode 100644 index 00000000..9506c075 --- /dev/null +++ b/docs/md_v2/basic/content_filtering.md @@ -0,0 +1,84 @@ +# Content Filtering in Crawl4AI + +This guide explains how to use content filtering strategies in Crawl4AI to extract the most relevant information from crawled web pages. You'll learn how to use the built-in `BM25ContentFilter` and how to create your own custom content filtering strategies.
+ +## Relevant Content Filter + +The `RelevantContentFilter` is an abstract class that provides a common interface for content filtering strategies. Specific filtering algorithms, like `BM25ContentFilter`, inherit from this class and implement the `filter_content` method. This method takes the HTML content as input and returns a list of filtered text blocks. + +## BM25 Algorithm + +The `BM25ContentFilter` uses the BM25 algorithm, a ranking function used in information retrieval to estimate the relevance of documents to a given search query. In Crawl4AI, this algorithm helps to identify and extract text chunks that are most relevant to the page's metadata or a user-specified query. + +### Usage + +To use the `BM25ContentFilter`, initialize it and then pass it as the `content_filter` parameter to the `arun` method of the crawler. + +```python +from crawl4ai import AsyncWebCrawler +from crawl4ai.content_filter_strategy import BM25ContentFilter + +async def filter_content(url, query=None): + async with AsyncWebCrawler() as crawler: + content_filter = BM25ContentFilter(user_query=query) + result = await crawler.arun(url=url, content_filter=content_filter, fit_markdown=True) # Set fit_markdown flag to True to trigger BM25 filtering + if result.success: + print(f"Filtered Content (JSON):\n{result.extracted_content}") + print(f"\nFiltered Markdown:\n{result.fit_markdown}") # New field in CrawlResult object + print(f"\nFiltered HTML:\n{result.fit_html}") # New field in CrawlResult object. Note that raw HTML may have tags re-organized due to internal parsing. + else: + print("Error:", result.error_message) + +# Example usage: +asyncio.run(filter_content("https://en.wikipedia.org/wiki/Apple", "fruit nutrition health")) # with query +asyncio.run(filter_content("https://en.wikipedia.org/wiki/Apple")) # without query, metadata will be used as the query. + +``` + +### Parameters + +- **`user_query`**: (Optional) A string representing the search query.
If not provided, the filter extracts relevant metadata (title, description, keywords) from the page and uses that as the query. +- **`bm25_threshold`**: (Optional, default 1.0) A float value that controls the threshold for relevance. Higher values result in stricter filtering, returning only the most relevant text chunks. Lower values result in more lenient filtering. + + +## Fit Markdown Flag + +Setting the `fit_markdown` flag to `True` in the `arun` method activates the BM25 content filtering during the crawl. The `fit_markdown` parameter instructs the scraper to extract and clean the HTML, primarily to prepare for a Large Language Model that cannot process large amounts of data. Setting this flag not only improves the quality of the extracted content but also adds the filtered content to two new attributes in the returned `CrawlResult` object: `fit_markdown` and `fit_html`. + + +## Custom Content Filtering Strategies + +You can create your own custom filtering strategies by inheriting from the `RelevantContentFilter` class and implementing the `filter_content` method. This allows you to tailor the filtering logic to your specific needs. + +```python +from crawl4ai.content_filter_strategy import RelevantContentFilter +from bs4 import BeautifulSoup, Tag +from typing import List + +class MyCustomFilter(RelevantContentFilter): + def filter_content(self, html: str) -> List[str]: + soup = BeautifulSoup(html, 'lxml') + # Implement custom filtering logic here + # Example: extract all paragraphs within divs with class "article-body" + filtered_paragraphs = [] + for tag in soup.select("div.article-body p"): + if isinstance(tag, Tag): + filtered_paragraphs.append(str(tag)) # Add the cleaned HTML element. 
+ return filtered_paragraphs + + + +async def custom_filter_demo(url: str): + async with AsyncWebCrawler() as crawler: + custom_filter = MyCustomFilter() + result = await crawler.arun(url, content_filter=custom_filter) + if result.success: + print(result.extracted_content) + +``` + +This example demonstrates extracting paragraphs from a specific div class. You can customize this logic to implement different filtering strategies, use regular expressions, analyze text density, or apply other relevant techniques. + +## Conclusion + +Content filtering strategies provide a powerful way to refine the output of your crawls. By using `BM25ContentFilter` or creating custom strategies, you can focus on the most pertinent information and improve the efficiency of your data processing pipeline. diff --git a/docs/md_v2/basic/file-download.md b/docs/md_v2/basic/file-download.md new file mode 100644 index 00000000..c37e8812 --- /dev/null +++ b/docs/md_v2/basic/file-download.md @@ -0,0 +1,148 @@ +# Download Handling in Crawl4AI + +This guide explains how to use Crawl4AI to handle file downloads during crawling. You'll learn how to trigger downloads, specify download locations, and access downloaded files. + +## Enabling Downloads + +By default, Crawl4AI does not download files. To enable downloads, set the `accept_downloads` parameter to `True` in either the `AsyncWebCrawler` constructor or the `arun` method. + +```python +from crawl4ai import AsyncWebCrawler + +async def main(): + async with AsyncWebCrawler(accept_downloads=True) as crawler: # Globally enable downloads + # ... your crawling logic ... + +asyncio.run(main()) +``` + +Or, enable it for a specific crawl: + +```python +async def main(): + async with AsyncWebCrawler() as crawler: + result = await crawler.arun(url="...", accept_downloads=True) + # ... +``` + +## Specifying Download Location + +You can specify the download directory using the `downloads_path` parameter. 
If not provided, Crawl4AI creates a "downloads" directory inside the `.crawl4ai` folder in your home directory. + +```python +import os +from pathlib import Path + +# ... inside your crawl function: + +downloads_path = os.path.join(os.getcwd(), "my_downloads") # Custom download path +os.makedirs(downloads_path, exist_ok=True) + +result = await crawler.arun(url="...", downloads_path=downloads_path, accept_downloads=True) + +# ... +``` + +If you are setting it globally, provide the path to the AsyncWebCrawler: +```python +async def crawl_with_downloads(url: str, download_path: str): + async with AsyncWebCrawler( + accept_downloads=True, + downloads_path=download_path, # or set it on arun + verbose=True + ) as crawler: + result = await crawler.arun(url=url) # you still need to enable downloads per call. + # ... +``` + + + +## Triggering Downloads + +Downloads are typically triggered by user interactions on a web page (e.g., clicking a download button). You can simulate these actions with the `js_code` parameter, injecting JavaScript code to be executed within the browser context. The `wait_for` parameter might also be crucial to allowing sufficient time for downloads to initiate before the crawler proceeds. + +```python +result = await crawler.arun( + url="https://www.python.org/downloads/", + js_code=""" + // Find and click the first Windows installer link + const downloadLink = document.querySelector('a[href$=".exe"]'); + if (downloadLink) { + downloadLink.click(); + } + """, + wait_for=5 # Wait for 5 seconds for the download to start +) +``` + +## Accessing Downloaded Files + +Downloaded file paths are stored in the `downloaded_files` attribute of the returned `CrawlResult` object. This is a list of strings, with each string representing the absolute path to a downloaded file. 
+ +```python +if result.downloaded_files: + print("Downloaded files:") + for file_path in result.downloaded_files: + print(f"- {file_path}") + # Perform operations with downloaded files, e.g., check file size + file_size = os.path.getsize(file_path) + print(f"- File size: {file_size} bytes") +else: + print("No files downloaded.") +``` + + +## Example: Downloading Multiple Files + +```python +import asyncio +import os +from pathlib import Path +from crawl4ai import AsyncWebCrawler + +async def download_multiple_files(url: str, download_path: str): + + async with AsyncWebCrawler( + accept_downloads=True, + downloads_path=download_path, + verbose=True + ) as crawler: + result = await crawler.arun( + url=url, + js_code=""" + // Trigger multiple downloads (example) + const downloadLinks = document.querySelectorAll('a[download]'); // Or a more specific selector + for (const link of downloadLinks) { + link.click(); + await new Promise(r => setTimeout(r, 2000)); // Add a small delay between clicks if needed + } + """, + wait_for=10 # Adjust the timeout to match the expected time for all downloads to start + ) + + if result.downloaded_files: + print("Downloaded files:") + for file in result.downloaded_files: + print(f"- {file}") + else: + print("No files downloaded.") + + +# Example usage +download_path = os.path.join(Path.home(), ".crawl4ai", "downloads") +os.makedirs(download_path, exist_ok=True) # Create directory if it doesn't exist + + +asyncio.run(download_multiple_files("https://www.python.org/downloads/windows/", download_path)) +``` + +## Important Considerations + +- **Browser Context:** Downloads are managed within the browser context. Ensure your `js_code` correctly targets the download triggers on the specific web page. +- **Waiting:** Use `wait_for` to manage the timing of the crawl process if immediate download might not occur. +- **Error Handling:** Implement proper error handling to gracefully manage failed downloads or incorrect file paths. 
+- **Security:** Downloaded files should be scanned for potential security threats before use. + + + +This guide provides a foundation for handling downloads with Crawl4AI. You can adapt these techniques to manage downloads in various scenarios and integrate them into more complex crawling workflows. diff --git a/docs/md_v2/basic/quickstart.md b/docs/md_v2/basic/quickstart.md index f4904915..95b8a397 100644 --- a/docs/md_v2/basic/quickstart.md +++ b/docs/md_v2/basic/quickstart.md @@ -8,7 +8,7 @@ First, let's import the necessary modules and create an instance of `AsyncWebCra ```python import asyncio -from crawl4ai import AsyncWebCrawler +from crawl4ai import AsyncWebCrawler, CacheMode async def main(): async with AsyncWebCrawler(verbose=True) as crawler: @@ -42,7 +42,7 @@ async def capture_and_save_screenshot(url: str, output_path: str): result = await crawler.arun( url=url, screenshot=True, - bypass_cache=True + cache_mode=CacheMode.BYPASS ) if result.success and result.screenshot: @@ -62,15 +62,15 @@ Crawl4AI supports multiple browser engines.
Here's how to use different browsers ```python # Use Firefox async with AsyncWebCrawler(browser_type="firefox", verbose=True, headless=True) as crawler: - result = await crawler.arun(url="https://www.example.com", bypass_cache=True) + result = await crawler.arun(url="https://www.example.com", cache_mode=CacheMode.BYPASS) # Use WebKit async with AsyncWebCrawler(browser_type="webkit", verbose=True, headless=True) as crawler: - result = await crawler.arun(url="https://www.example.com", bypass_cache=True) + result = await crawler.arun(url="https://www.example.com", cache_mode=CacheMode.BYPASS) # Use Chromium (default) async with AsyncWebCrawler(verbose=True, headless=True) as crawler: - result = await crawler.arun(url="https://www.example.com", bypass_cache=True) + result = await crawler.arun(url="https://www.example.com", cache_mode=CacheMode.BYPASS) ``` ### User Simulation 🎭 @@ -81,7 +81,7 @@ Simulate real user behavior to avoid detection: async with AsyncWebCrawler(verbose=True, headless=True) as crawler: result = await crawler.arun( url="YOUR-URL-HERE", - bypass_cache=True, + cache_mode=CacheMode.BYPASS, simulate_user=True, # Causes random mouse movements and clicks override_navigator=True # Makes the browser appear more like a real user ) @@ -99,7 +99,7 @@ async def main(): print(f"First crawl result: {result1.markdown[:100]}...") # Force to crawl again - result2 = await crawler.arun(url="https://www.nbcnews.com/business", bypass_cache=True) + result2 = await crawler.arun(url="https://www.nbcnews.com/business", cache_mode=CacheMode.BYPASS) print(f"Second crawl result: {result2.markdown[:100]}...") asyncio.run(main()) @@ -189,7 +189,7 @@ extraction_strategy = LLMExtractionStrategy( async with AsyncWebCrawler() as crawler: result = await crawler.arun( url="https://paulgraham.com/love.html", - bypass_cache=True, + cache_mode=CacheMode.BYPASS, extraction_strategy=extraction_strategy ) ``` @@ -239,7 +239,7 @@ async def crawl_dynamic_content(): js_code=js_next_page if 
page > 0 else None, wait_for=wait_for if page > 0 else None, js_only=page > 0, - bypass_cache=True, + cache_mode=CacheMode.BYPASS, headless=False, ) @@ -254,7 +254,7 @@ Remove overlay elements and fit content appropriately: async with AsyncWebCrawler(headless=False) as crawler: result = await crawler.arun( url="your-url-here", - bypass_cache=True, + cache_mode=CacheMode.BYPASS, word_count_threshold=10, remove_overlay_elements=True, screenshot=True @@ -282,7 +282,7 @@ async with AsyncWebCrawler() as crawler: result = await crawler.arun( url="https://www.nbcnews.com/business", word_count_threshold=0, - bypass_cache=True, + cache_mode=CacheMode.BYPASS, verbose=False, ) end = time.time() diff --git a/docs/md_v2/basic/simple-crawling.md b/docs/md_v2/basic/simple-crawling.md index 097d5e61..871fa64c 100644 --- a/docs/md_v2/basic/simple-crawling.md +++ b/docs/md_v2/basic/simple-crawling.md @@ -12,7 +12,9 @@ from crawl4ai import AsyncWebCrawler async def main(): async with AsyncWebCrawler() as crawler: - result = await crawler.arun(url="https://example.com") + result = await crawler.arun( + url="https://example.com" + ) print(result.markdown) # Print clean markdown content if __name__ == "__main__": @@ -24,7 +26,7 @@ if __name__ == "__main__": The `arun()` method returns a `CrawlResult` object with several useful properties. 
Here's a quick overview (see [CrawlResult](../api/crawl-result.md) for complete details): ```python -result = await crawler.arun(url="https://example.com") +result = await crawler.arun(url="https://example.com", fit_markdown=True) # Different content formats print(result.html) # Raw HTML @@ -81,7 +83,7 @@ Here's a more comprehensive example showing common usage patterns: ```python import asyncio -from crawl4ai import AsyncWebCrawler +from crawl4ai import AsyncWebCrawler, CacheMode async def main(): async with AsyncWebCrawler(verbose=True) as crawler: @@ -97,7 +99,7 @@ async def main(): remove_overlay_elements=True, # Cache control - bypass_cache=False # Use cache if available + cache_mode=CacheMode.ENABLED # Use cache if available ) if result.success: diff --git a/docs/md_v2/tutorial/episode_11_2_Extraction_Strategies_LLM.md b/docs/md_v2/tutorial/episode_11_2_Extraction_Strategies_LLM.md index 3682425f..a9f00e92 100644 --- a/docs/md_v2/tutorial/episode_11_2_Extraction_Strategies_LLM.md +++ b/docs/md_v2/tutorial/episode_11_2_Extraction_Strategies_LLM.md @@ -52,7 +52,7 @@ Here’s a comprehensive outline for the **LLM Extraction Strategy** video, cove extraction_type="schema", instruction="Extract model names and fees for input and output tokens from the page."
), - bypass_cache=True + cache_mode=CacheMode.BYPASS ) print(result.extracted_content) ``` @@ -98,7 +98,7 @@ Here’s a comprehensive outline for the **LLM Extraction Strategy** video, cove result = await crawler.arun( url="https://example.com/some-article", extraction_strategy=extraction_strategy, - bypass_cache=True + cache_mode=CacheMode.BYPASS ) print(result.extracted_content) ``` diff --git a/docs/md_v2/tutorial/episode_11_3_Extraction_Strategies_Cosine.md b/docs/md_v2/tutorial/episode_11_3_Extraction_Strategies_Cosine.md index 9f1c00ea..6100ae4c 100644 --- a/docs/md_v2/tutorial/episode_11_3_Extraction_Strategies_Cosine.md +++ b/docs/md_v2/tutorial/episode_11_3_Extraction_Strategies_Cosine.md @@ -55,7 +55,7 @@ Here’s a structured outline for the **Cosine Similarity Strategy** video, cove result = await crawler.arun( url=url, extraction_strategy=extraction_strategy, - bypass_cache=True + cache_mode=CacheMode.BYPASS ) print(result.extracted_content) ``` @@ -103,7 +103,7 @@ Here’s a structured outline for the **Cosine Similarity Strategy** video, cove result = await crawler.arun( url=url, extraction_strategy=extraction_strategy, - bypass_cache=True + cache_mode=CacheMode.BYPASS ) print(result.extracted_content) ``` diff --git a/docs/md_v2/tutorial/tutorial.md b/docs/md_v2/tutorial/tutorial.md index bf355ed0..7bead842 100644 --- a/docs/md_v2/tutorial/tutorial.md +++ b/docs/md_v2/tutorial/tutorial.md @@ -26,7 +26,7 @@ Here's a condensed outline of the **Installation and Setup** video content: - Walk through a simple test script to confirm the setup: ```python import asyncio - from crawl4ai import AsyncWebCrawler + from crawl4ai import AsyncWebCrawler, CacheMode async def main(): async with AsyncWebCrawler(verbose=True) as crawler: @@ -1093,7 +1093,7 @@ Here’s a comprehensive outline for the **LLM Extraction Strategy** video, cove extraction_type="schema", instruction="Extract model names and fees for input and output tokens from the page." 
), - bypass_cache=True + cache_mode=CacheMode.BYPASS ) print(result.extracted_content) ``` @@ -1139,7 +1139,7 @@ Here’s a comprehensive outline for the **LLM Extraction Strategy** video, cove result = await crawler.arun( url="https://example.com/some-article", extraction_strategy=extraction_strategy, - bypass_cache=True + cache_mode=CacheMode.BYPASS ) print(result.extracted_content) ``` @@ -1248,7 +1248,7 @@ Here’s a structured outline for the **Cosine Similarity Strategy** video, cove result = await crawler.arun( url=url, extraction_strategy=extraction_strategy, - bypass_cache=True + cache_mode=CacheMode.BYPASS ) print(result.extracted_content) ``` @@ -1296,7 +1296,7 @@ Here’s a structured outline for the **Cosine Similarity Strategy** video, cove result = await crawler.arun( url=url, extraction_strategy=extraction_strategy, - bypass_cache=True + cache_mode=CacheMode.BYPASS ) print(result.extracted_content) ``` From 152ac35bc2805610863d1f13efe8434fe2d290bd Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sun, 17 Nov 2024 21:09:26 +0800 Subject: [PATCH 044/115] feat(docs): update README for version 0.3.74 with new features and improvements fix(version): update version number to 0.3.74 refactor(async_webcrawler): enhance logging and add domain-based request delay --- README.md | 16 +++++------ crawl4ai/__version__.py | 2 +- crawl4ai/async_crawler_strategy.py | 4 +-- crawl4ai/async_webcrawler.py | 43 +++++++++++++++++++++++++----- crawl4ai/config.py | 1 + 5 files changed, 47 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index 9c3796cd..f6c8dc08 100644 --- a/README.md +++ b/README.md @@ -13,17 +13,15 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it acc ## New in 0.3.74 ✨ -- 🚀 **Blazing Fast Scraping:** The scraping process is now significantly faster, often completing in under 100 milliseconds (excluding web fetch time)! 
-- 📥 **Download Mastery:** Control downloads, specify folders, and track files within the `CrawlResult` object. -- 🔎 **Relevance Filtering:** Extract the most important content with the new `RelevanceContentFilter` and BM25 algorithm. Control filtering with the `fit_markdown` flag. +- 🚀 **Blazing Fast Scraping:** The scraping process is now significantly faster, often completing in under 100 milliseconds (excluding web fetch time)! +- 📥 **Download Manager:** Integrated file crawling and downloading capabilities, with full control over file management and tracking within the `CrawlResult` object. +- 🔎 **Markdown Filter:** Enhanced content extraction using BM25 algorithm to create cleaner markdown with only relevant webpage content. - 🗂️ **Local & Raw HTML:** Crawl local files (`file://`) and raw HTML strings (`raw:`) directly. -- 🤖 **Browser Boss:** Manage browser sessions with persistent contexts, process monitoring, and tf-playwright-stealth integration. Configure using `use_managed_browser`, `user_data_dir`, and `use_persistent_context` parameters. +- 🤖 **Browser Control:** Use your own browser setup for crawling, with persistent contexts and stealth integration to bypass anti-bot measures. - ☁️ **API & Cache Boost:** CORS support, static file serving, and a new filesystem-based cache for blazing-fast performance. Fine-tune caching with the `CacheMode` enum (ENABLED, DISABLED, READ_ONLY, WRITE_ONLY, BYPASS) and the `always_bypass_cache` parameter. -- 🔒 **API Security:** Protect your API server with token-based authentication using the `CRAWL4AI_API_TOKEN` environment variable. -- 🔄 **Synchronous & Direct Crawling:** Get immediate results with `/crawl_sync` or bypass the task queue with `/crawl_direct`. -- 🛠️ **Database Migration:** A new `crawl4ai-migrate` command ensures smooth upgrades and data integrity between versions. -- 🐛 **Squashed Bugs:** Fixed browser context issues in Docker, memory leaks, enhanced error handling, and improved HTML parsing. 
- +- 🐳 **API Gateway:** Run Crawl4AI as a local or cloud API service, enabling cross-platform usage through a containerized server with secure token authentication via `CRAWL4AI_API_TOKEN`. +- 🛠️ **Database Improvements:** Enhanced database system for handling larger content sets with improved caching and faster performance. +- 🐛 **Squashed Bugs:** Fixed browser context issues in Docker, memory leaks, enhanced error handling, and improved HTML parsing. ## Try it Now! diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index 7ab71c9b..65ee6e73 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,2 +1,2 @@ # crawl4ai/_version.py -__version__ = "0.3.731" \ No newline at end of file +__version__ = "0.3.74" \ No newline at end of file diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index a67591af..90d5cbe8 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -605,7 +605,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): proxy={"server": self.proxy} if self.proxy else None, java_script_enabled=True, accept_downloads=self.accept_downloads, - downloads_path=self.downloads_path if self.accept_downloads else None + # downloads_path=self.downloads_path if self.accept_downloads else None ) await context.add_cookies([{"name": "cookiesEnabled", "value": "true", "url": url}]) await context.set_extra_http_headers(self.headers) @@ -905,7 +905,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): ) return response except Error as e: - raise Error(f"[ERROR] 🚫 crawl(): Failed to crawl {url}: {str(e)}") + raise Error(f"async_crawler_strategy.py:_crawleb(): {str(e)}") # finally: # if not session_id: # await page.close() diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index d22e3b1f..79a17ac4 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -15,15 +15,19 @@ from .extraction_strategy import * from 
.async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode from .content_scrapping_strategy import WebScrapingStrategy + from .config import ( MIN_WORD_THRESHOLD, - IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD + IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD, + URL_LOG_SHORTEN_LENGTH ) from .utils import ( sanitize_input_encode, InvalidCSSSelectorError, format_html ) +from urllib.parse import urlparse +import random from .__version__ import __version__ as crawl4ai_version @@ -51,6 +55,7 @@ class AsyncWebCrawler: To disable deprecation warnings: Pass warning=False to suppress the warning. """ + _domain_last_hit = {} def __init__( self, @@ -248,7 +253,7 @@ class AsyncWebCrawler: screenshot_data = async_response.screenshot t2 = time.perf_counter() if verbose: - print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Live fetch for {cache_context.display_url} | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {t2 - t1:.2f}s") + print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Live fetch for {cache_context.display_url}... | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {t2 - t1:.2f}s") # Process the HTML content crawl_result = await self.aprocess_html( @@ -283,7 +288,7 @@ class AsyncWebCrawler: crawl_result.session_id = kwargs.get("session_id", None) if verbose: - print(f"{Fore.GREEN}{self.tag_format('COMPLETE')} {self.log_icons['COMPLETE']} {cache_context.display_url} | Status: {Fore.GREEN if crawl_result.success else Fore.RED}{crawl_result.success} | {Fore.YELLOW}Total: {time.perf_counter() - start_time:.2f}s{Style.RESET_ALL}") + print(f"{Fore.GREEN}{self.tag_format('COMPLETE')} {self.log_icons['COMPLETE']} {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... 
| Status: {Fore.GREEN if crawl_result.success else Fore.RED}{crawl_result.success} | {Fore.YELLOW}Total: {time.perf_counter() - start_time:.2f}s{Style.RESET_ALL}") # Update cache if appropriate @@ -295,7 +300,7 @@ class AsyncWebCrawler: except Exception as e: if not hasattr(e, "msg"): e.msg = str(e) - print(f"{Fore.RED}{self.tag_format('ERROR')} {self.log_icons['ERROR']} Failed to crawl {cache_context.display_url} | {e.msg}{Style.RESET_ALL}") + print(f"{Fore.RED}{self.tag_format('ERROR')} {self.log_icons['ERROR']} Failed to crawl {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... | {e.msg}{Style.RESET_ALL}") return CrawlResult( url=url, html="", @@ -350,10 +355,29 @@ class AsyncWebCrawler: if cache_mode is None: cache_mode = CacheMode.BYPASS - semaphore_count = kwargs.get('semaphore_count', 5) + semaphore_count = kwargs.get('semaphore_count', 10) semaphore = asyncio.Semaphore(semaphore_count) async def crawl_with_semaphore(url): + domain = urlparse(url).netloc + current_time = time.time() + + print(f"{Fore.LIGHTBLACK_EX}{self.tag_format('PARALLEL')} Started task for {url[:50]}...{Style.RESET_ALL}") + + # Get delay settings from kwargs or use defaults + mean_delay = kwargs.get('mean_delay', 0.1) # 0.1 seconds default mean delay + max_range = kwargs.get('max_range', 0.3) # 0.3 seconds default max additional delay + + # Check if we need to wait + if domain in self._domain_last_hit: + time_since_last = current_time - self._domain_last_hit[domain] + if time_since_last < mean_delay: + delay = mean_delay + random.uniform(0, max_range) + await asyncio.sleep(delay) + + # Update last hit time + self._domain_last_hit[domain] = current_time + async with semaphore: return await self.arun( url, @@ -369,8 +393,13 @@ class AsyncWebCrawler: **kwargs, ) + # Print start message + print(f"{Fore.CYAN}{self.tag_format('INIT')} {self.log_icons['INIT']} Starting concurrent crawling for {len(urls)} URLs...{Style.RESET_ALL}") + start_time = time.perf_counter() tasks =
[crawl_with_semaphore(url) for url in urls] results = await asyncio.gather(*tasks, return_exceptions=True) + end_time = time.perf_counter() + print(f"{Fore.YELLOW}{self.tag_format('COMPLETE')} {self.log_icons['COMPLETE']} Concurrent crawling completed for {len(urls)} URLs | Total time: {end_time - start_time:.2f}s{Style.RESET_ALL}") return [result if not isinstance(result, Exception) else str(result) for result in results] @@ -423,7 +452,7 @@ class AsyncWebCrawler: metadata = result.get("metadata", {}) if verbose: - print(f"{Fore.MAGENTA}{self.tag_format('SCRAPE')} {self.log_icons['SCRAPE']} Processed {_url}{Style.RESET_ALL} | Time: {int((time.perf_counter() - t1) * 1000)}ms") + print(f"{Fore.MAGENTA}{self.tag_format('SCRAPE')} {self.log_icons['SCRAPE']} Processed {_url[:URL_LOG_SHORTEN_LENGTH]}...{Style.RESET_ALL} | Time: {int((time.perf_counter() - t1) * 1000)}ms") @@ -439,7 +468,7 @@ class AsyncWebCrawler: extracted_content = extraction_strategy.run(url, sections) extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False) if verbose: - print(f"{Fore.YELLOW}{self.tag_format('EXTRACT')} {self.log_icons['EXTRACT']} Completed for {_url}{Style.RESET_ALL} | Time: {time.perf_counter() - t1:.2f}s{Style.RESET_ALL}") + print(f"{Fore.YELLOW}{self.tag_format('EXTRACT')} {self.log_icons['EXTRACT']} Completed for {_url[:URL_LOG_SHORTEN_LENGTH]}...{Style.RESET_ALL} | Time: {time.perf_counter() - t1:.2f}s{Style.RESET_ALL}") diff --git a/crawl4ai/config.py b/crawl4ai/config.py index 6b1324dd..786ca4e5 100644 --- a/crawl4ai/config.py +++ b/crawl4ai/config.py @@ -55,4 +55,5 @@ IMAGE_SCORE_THRESHOLD = 2 MAX_METRICS_HISTORY = 1000 NEED_MIGRATION = True +URL_LOG_SHORTEN_LENGTH = 30 SHOW_DEPRECATION_WARNINGS = True \ No newline at end of file From 852729ff380f0568d6874bc960606ba3cce0e935 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Mon, 18 Nov 2024 21:00:06 +0800 Subject: [PATCH 045/115] feat(docker): add Docker Compose configurations for local and 
hub deployment; enhance GPU support checks in Dockerfile feat(requirements): update requirements.txt to include snowballstemmer fix(version_manager): correct version parsing to use __version__.__version__ feat(main): introduce chunking strategy and content filter in CrawlRequest model feat(content_filter): enhance BM25 algorithm with priority tag scoring for improved content relevance feat(logger): implement new async logger engine replacing print statements throughout library fix(database): resolve version-related deadlock and circular lock issues in database operations docs(docker): expand Docker deployment documentation with usage instructions for Docker Compose --- Dockerfile | 12 +- crawl4ai/async_crawler_strategy.py | 149 ++++++++++++---- crawl4ai/async_database.py | 189 +++++++++++++------- crawl4ai/async_logger.py | 231 +++++++++++++++++++++++++ crawl4ai/async_webcrawler.py | 144 +++++++++++---- crawl4ai/content_filter_strategy.py | 71 ++++---- crawl4ai/content_scrapping_strategy.py | 44 ++++- crawl4ai/version_manager.py | 4 +- docker-compose.hub.yml | 27 +++ docker-compose.local.yml | 33 ++++ docker-compose.yml | 47 ++++- docs/examples/v0.3.74.overview.py | 119 +++++++++---- docs/md_v2/basic/docker-deploymeny.md | 88 ++++++++++ main.py | 23 ++- requirements.txt | 3 +- 15 files changed, 952 insertions(+), 232 deletions(-) create mode 100644 crawl4ai/async_logger.py create mode 100644 docker-compose.hub.yml create mode 100644 docker-compose.local.yml diff --git a/Dockerfile b/Dockerfile index aac2280a..bd71deae 100644 --- a/Dockerfile +++ b/Dockerfile @@ -62,11 +62,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ libatspi2.0-0 \ && rm -rf /var/lib/apt/lists/* -# GPU support if enabled -RUN if [ "$ENABLE_GPU" = "true" ] ; then \ - apt-get update && apt-get install -y --no-install-recommends \ - nvidia-cuda-toolkit \ - && rm -rf /var/lib/apt/lists/* ; \ +# GPU support if enabled and architecture is supported +RUN if [ "$ENABLE_GPU" = 
"true" ] && [ "$(dpkg --print-architecture)" != "arm64" ] ; then \ + apt-get update && apt-get install -y --no-install-recommends \ + nvidia-cuda-toolkit \ + && rm -rf /var/lib/apt/lists/* ; \ + else \ + echo "Skipping NVIDIA CUDA Toolkit installation (unsupported architecture or GPU disabled)"; \ fi # Create and set working directory diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 90d5cbe8..a6ba8e50 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -35,13 +35,15 @@ stealth_config = StealthConfig( class ManagedBrowser: - def __init__(self, browser_type: str = "chromium", user_data_dir: Optional[str] = None, headless: bool = False): + def __init__(self, browser_type: str = "chromium", user_data_dir: Optional[str] = None, headless: bool = False, logger = None): self.browser_type = browser_type self.user_data_dir = user_data_dir self.headless = headless self.browser_process = None self.temp_dir = None self.debugging_port = 9222 + self.logger = logger + self.shutting_down = False async def start(self) -> str: """ @@ -76,15 +78,38 @@ class ManagedBrowser: async def _monitor_browser_process(self): """Monitor the browser process for unexpected termination.""" if self.browser_process: - stdout, stderr = await asyncio.gather( - asyncio.to_thread(self.browser_process.stdout.read), - asyncio.to_thread(self.browser_process.stderr.read) - ) - if self.browser_process.poll() is not None: - print(f"Browser process terminated unexpectedly with code {self.browser_process.returncode}") - print(f"STDOUT: {stdout.decode()}") - print(f"STDERR: {stderr.decode()}") - await self.cleanup() + try: + stdout, stderr = await asyncio.gather( + asyncio.to_thread(self.browser_process.stdout.read), + asyncio.to_thread(self.browser_process.stderr.read) + ) + + # Check shutting_down flag BEFORE logging anything + if self.browser_process.poll() is not None: + if not self.shutting_down: + self.logger.error( + 
message="Browser process terminated unexpectedly | Code: {code} | STDOUT: {stdout} | STDERR: {stderr}", + tag="ERROR", + params={ + "code": self.browser_process.returncode, + "stdout": stdout.decode(), + "stderr": stderr.decode() + } + ) + await self.cleanup() + else: + self.logger.info( + message="Browser process terminated normally | Code: {code}", + tag="INFO", + params={"code": self.browser_process.returncode} + ) + except Exception as e: + if not self.shutting_down: + self.logger.error( + message="Error monitoring browser process: {error}", + tag="ERROR", + params={"error": str(e)} + ) def _get_browser_path(self) -> str: """Returns the browser executable path based on OS and browser type""" @@ -134,20 +159,39 @@ class ManagedBrowser: async def cleanup(self): """Cleanup browser process and temporary directory""" + # Set shutting_down flag BEFORE any termination actions + self.shutting_down = True + if self.browser_process: try: self.browser_process.terminate() - await asyncio.sleep(1) + # Wait for process to end gracefully + for _ in range(10): # 10 attempts, 100ms each + if self.browser_process.poll() is not None: + break + await asyncio.sleep(0.1) + + # Force kill if still running if self.browser_process.poll() is None: self.browser_process.kill() + await asyncio.sleep(0.1) # Brief wait for kill to take effect + except Exception as e: - print(f"Error terminating browser: {e}") + self.logger.error( + message="Error terminating browser: {error}", + tag="ERROR", + params={"error": str(e)} + ) if self.temp_dir and os.path.exists(self.temp_dir): try: shutil.rmtree(self.temp_dir) except Exception as e: - print(f"Error removing temporary directory: {e}") + self.logger.error( + message="Error removing temporary directory: {error}", + tag="ERROR", + params={"error": str(e)} + ) class AsyncCrawlerStrategy(ABC): @@ -172,7 +216,8 @@ class AsyncCrawlerStrategy(ABC): pass class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): - def __init__(self, 
use_cached_html=False, js_code=None, **kwargs): + def __init__(self, use_cached_html=False, js_code=None, logger = None, **kwargs): + self.logger = logger self.use_cached_html = use_cached_html self.user_agent = kwargs.get( "user_agent", @@ -231,7 +276,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): self.managed_browser = ManagedBrowser( browser_type=self.browser_type, user_data_dir=self.user_data_dir, - headless=self.headless + headless=self.headless, + logger=self.logger ) cdp_url = await self.managed_browser.start() self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url) @@ -282,6 +328,10 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # Add extra args if provided if self.extra_args: browser_args["args"].extend(self.extra_args) + + # Add downloads path if downloads are enabled + if self.accept_downloads: + browser_args["downloads_path"] = self.downloads_path # Add proxy settings if a proxy is specified if self.proxy: @@ -344,6 +394,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): self.browser = None if self.managed_browser: + await asyncio.sleep(0.5) await self.managed_browser.cleanup() self.managed_browser = None @@ -491,9 +542,19 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): }} """) else: - print(f"Warning: Could not access content frame for iframe {i}") + # print(f"Warning: Could not access content frame for iframe {i}") + self.logger.warning( + message="Could not access content frame for iframe {index}", + tag="SCRAPE", + params={"index": i} + ) except Exception as e: - print(f"Error processing iframe {i}: {str(e)}") + self.logger.error( + message="Error processing iframe {index}: {error}", + tag="ERROR", + params={"index": i, "error": str(e)} + ) + # print(f"Error processing iframe {i}: {str(e)}") # Return the page object return page @@ -620,7 +681,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): context = await self.browser.new_context( 
user_agent=self.user_agent, viewport={"width": 1920, "height": 1080}, - proxy={"server": self.proxy} if self.proxy else None + proxy={"server": self.proxy} if self.proxy else None, + accept_downloads=self.accept_downloads, ) await context.set_extra_http_headers(self.headers) @@ -917,17 +979,31 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): suggested_filename = download.suggested_filename download_path = os.path.join(self.downloads_path, suggested_filename) - if self.verbose: - print(f"[LOG] 📥 Downloading {suggested_filename} to {download_path}") + self.logger.info( + message="Downloading {filename} to {path}", + tag="FETCH", + params={"filename": suggested_filename, "path": download_path} + ) + start_time = time.perf_counter() await download.save_as(download_path) + end_time = time.perf_counter() self._downloaded_files.append(download_path) - - if self.verbose: - print(f"[LOG] ✅ Downloaded {suggested_filename} successfully") + + self.logger.success( + message="Downloaded {filename} successfully", + tag="COMPLETE", + params={"filename": suggested_filename, "path": download_path, "duration": f"{end_time - start_time:.2f}s"} + ) except Exception as e: - if self.verbose: - print(f"[ERROR] Failed to handle download: {str(e)}") + self.logger.error( + message="Failed to handle download: {error}", + tag="ERROR", + params={"error": str(e)} + ) + + # if self.verbose: + # print(f"[ERROR] Failed to handle download: {str(e)}") async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]: semaphore_count = kwargs.get('semaphore_count', 5) # Adjust as needed @@ -1070,8 +1146,13 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): await page.evaluate(remove_overlays_js) await page.wait_for_timeout(500) # Wait for any animations to complete except Exception as e: - if self.verbose: - print(f"Warning: Failed to remove overlay elements: {str(e)}") + self.logger.warning( + message="Failed to remove overlay elements: {error}", + 
tag="SCRAPE", + params={"error": str(e)} + ) + # if self.verbose: + # print(f"Warning: Failed to remove overlay elements: {str(e)}") async def take_screenshot(self, page: Page) -> str: """ @@ -1089,7 +1170,12 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): return base64.b64encode(screenshot).decode('utf-8') except Exception as e: error_message = f"Failed to take screenshot: {str(e)}" - print(error_message) + self.logger.error( + message="Screenshot failed: {error}", + tag="ERROR", + params={"error": error_message} + ) + # Generate an error image img = Image.new('RGB', (800, 600), color='black') @@ -1123,7 +1209,12 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): return base64.b64encode(screenshot).decode('utf-8') except Exception as e: error_message = f"Failed to take screenshot: {str(e)}" - print(error_message) + # print(error_message) + self.logger.error( + message="Screenshot failed: {error}", + tag="ERROR", + params={"error": error_message} + ) # Generate an error image img = Image.new('RGB', (800, 600), color='black') diff --git a/crawl4ai/async_database.py b/crawl4ai/async_database.py index 7809dfe1..19160b6e 100644 --- a/crawl4ai/async_database.py +++ b/crawl4ai/async_database.py @@ -12,10 +12,12 @@ import xxhash import aiofiles from .config import NEED_MIGRATION from .version_manager import VersionManager +from .async_logger import AsyncLogger # Set up logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) +base_directory = Path.home() DB_PATH = os.path.join(Path.home(), ".crawl4ai") os.makedirs(DB_PATH, exist_ok=True) DB_PATH = os.path.join(DB_PATH, "crawl4ai.db") @@ -28,15 +30,21 @@ class AsyncDatabaseManager: self.max_retries = max_retries self.connection_pool: Dict[int, aiosqlite.Connection] = {} self.pool_lock = asyncio.Lock() + self.init_lock = asyncio.Lock() self.connection_semaphore = asyncio.Semaphore(pool_size) self._initialized = False self.version_manager = VersionManager() + self.logger 
= AsyncLogger( + log_file=os.path.join(base_directory, ".crawl4ai", "crawler_db.log"), + verbose=False, + tag_width=10 + ) async def initialize(self): """Initialize the database and connection pool""" try: - logger.info("Initializing database...") + self.logger.info("Initializing database", tag="INIT") # Ensure the database file exists os.makedirs(os.path.dirname(self.db_path), exist_ok=True) @@ -47,31 +55,39 @@ class AsyncDatabaseManager: await self.ainit_db() # Verify the table exists - async def verify_table(db): + async with aiosqlite.connect(self.db_path, timeout=30.0) as db: async with db.execute( "SELECT name FROM sqlite_master WHERE type='table' AND name='crawled_data'" ) as cursor: result = await cursor.fetchone() if not result: raise Exception("crawled_data table was not created") - - await self.execute_with_retry(verify_table) # If version changed or fresh install, run updates if needs_update: - logger.info("New version detected, running updates...") + self.logger.info("New version detected, running updates", tag="INIT") await self.update_db_schema() from .migrations import run_migration # Import here to avoid circular imports await run_migration() self.version_manager.update_version() # Update stored version after successful migration - logger.info("Version update completed successfully") + self.logger.success("Version update completed successfully", tag="COMPLETE") else: - logger.info("Database initialization completed successfully") + self.logger.success("Database initialization completed successfully", tag="COMPLETE") + except Exception as e: - logger.error(f"Database initialization error: {e}") - logger.info("Database will be initialized on first use") + self.logger.error( + message="Database initialization error: {error}", + tag="ERROR", + params={"error": str(e)} + ) + self.logger.info( + message="Database will be initialized on first use", + tag="INIT" + ) + raise + async def cleanup(self): """Cleanup connections when shutting down""" @@ -84,34 
+100,41 @@ class AsyncDatabaseManager: async def get_connection(self): """Connection pool manager""" if not self._initialized: - async with self.pool_lock: # Prevent multiple simultaneous initializations - if not self._initialized: # Double-check after acquiring lock + # Use an asyncio.Lock to ensure only one initialization occurs + async with self.init_lock: + if not self._initialized: await self.initialize() self._initialized = True - async with self.connection_semaphore: - task_id = id(asyncio.current_task()) - try: - async with self.pool_lock: - if task_id not in self.connection_pool: - conn = await aiosqlite.connect( - self.db_path, - timeout=30.0 - ) - await conn.execute('PRAGMA journal_mode = WAL') - await conn.execute('PRAGMA busy_timeout = 5000') - self.connection_pool[task_id] = conn - - yield self.connection_pool[task_id] - - except Exception as e: - logger.error(f"Connection error: {e}") - raise - finally: - async with self.pool_lock: - if task_id in self.connection_pool: - await self.connection_pool[task_id].close() - del self.connection_pool[task_id] + await self.connection_semaphore.acquire() + task_id = id(asyncio.current_task()) + try: + async with self.pool_lock: + if task_id not in self.connection_pool: + conn = await aiosqlite.connect( + self.db_path, + timeout=30.0 + ) + await conn.execute('PRAGMA journal_mode = WAL') + await conn.execute('PRAGMA busy_timeout = 5000') + self.connection_pool[task_id] = conn + + yield self.connection_pool[task_id] + + except Exception as e: + self.logger.error( + message="Connection error: {error}", + tag="ERROR", + force_verbose=True, + params={"error": str(e)} + ) + raise + finally: + async with self.pool_lock: + if task_id in self.connection_pool: + await self.connection_pool[task_id].close() + del self.connection_pool[task_id] + self.connection_semaphore.release() async def execute_with_retry(self, operation, *args): @@ -124,13 +147,21 @@ class AsyncDatabaseManager: return result except Exception as e: if 
attempt == self.max_retries - 1: - logger.error(f"Operation failed after {self.max_retries} attempts: {e}") + self.logger.error( + message="Operation failed after {retries} attempts: {error}", + tag="ERROR", + force_verbose=True, + params={ + "retries": self.max_retries, + "error": str(e) + } + ) raise await asyncio.sleep(1 * (attempt + 1)) # Exponential backoff async def ainit_db(self): """Initialize database schema""" - async def _init(db): + async with aiosqlite.connect(self.db_path, timeout=30.0) as db: await db.execute(''' CREATE TABLE IF NOT EXISTS crawled_data ( url TEXT PRIMARY KEY, @@ -147,36 +178,37 @@ class AsyncDatabaseManager: downloaded_files TEXT DEFAULT "{}" -- New column added ) ''') - - await self.execute_with_retry(_init) + await db.commit() + async def update_db_schema(self): """Update database schema if needed""" - async def _check_columns(db): + async with aiosqlite.connect(self.db_path, timeout=30.0) as db: cursor = await db.execute("PRAGMA table_info(crawled_data)") columns = await cursor.fetchall() - return [column[1] for column in columns] + column_names = [column[1] for column in columns] + + # List of new columns to add + new_columns = ['media', 'links', 'metadata', 'screenshot', 'response_headers', 'downloaded_files'] + + for column in new_columns: + if column not in column_names: + await self.aalter_db_add_column(column, db) + await db.commit() - column_names = await self.execute_with_retry(_check_columns) - - # List of new columns to add - new_columns = ['media', 'links', 'metadata', 'screenshot', 'response_headers', 'downloaded_files'] - - for column in new_columns: - if column not in column_names: - await self.aalter_db_add_column(column) - - async def aalter_db_add_column(self, new_column: str): + async def aalter_db_add_column(self, new_column: str, db): """Add new column to the database""" - async def _alter(db): - if new_column == 'response_headers': - await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT 
DEFAULT "{{}}"') - else: - await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""') - logger.info(f"Added column '{new_column}' to the database.") + if new_column == 'response_headers': + await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT "{{}}"') + else: + await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""') + self.logger.info( + message="Added column '{column}' to the database", + tag="INIT", + params={"column": new_column} + ) - await self.execute_with_retry(_alter) async def aget_cached_url(self, url: str) -> Optional[CrawlResult]: """Retrieve cached URL data as CrawlResult""" @@ -235,7 +267,12 @@ class AsyncDatabaseManager: try: return await self.execute_with_retry(_get) except Exception as e: - logger.error(f"Error retrieving cached URL: {e}") + self.logger.error( + message="Error retrieving cached URL: {error}", + tag="ERROR", + force_verbose=True, + params={"error": str(e)} + ) return None async def acache_url(self, result: CrawlResult): @@ -291,7 +328,13 @@ class AsyncDatabaseManager: try: await self.execute_with_retry(_cache) except Exception as e: - logger.error(f"Error caching URL: {e}") + self.logger.error( + message="Error caching URL: {error}", + tag="ERROR", + force_verbose=True, + params={"error": str(e)} + ) + async def aget_total_count(self) -> int: """Get total number of cached URLs""" @@ -303,7 +346,12 @@ class AsyncDatabaseManager: try: return await self.execute_with_retry(_count) except Exception as e: - logger.error(f"Error getting total count: {e}") + self.logger.error( + message="Error getting total count: {error}", + tag="ERROR", + force_verbose=True, + params={"error": str(e)} + ) return 0 async def aclear_db(self): @@ -314,7 +362,12 @@ class AsyncDatabaseManager: try: await self.execute_with_retry(_clear) except Exception as e: - logger.error(f"Error clearing database: {e}") + self.logger.error( + message="Error clearing database: {error}", + 
tag="ERROR", + force_verbose=True, + params={"error": str(e)} + ) async def aflush_db(self): """Drop the entire table""" @@ -324,7 +377,12 @@ class AsyncDatabaseManager: try: await self.execute_with_retry(_flush) except Exception as e: - logger.error(f"Error flushing database: {e}") + self.logger.error( + message="Error flushing database: {error}", + tag="ERROR", + force_verbose=True, + params={"error": str(e)} + ) async def _store_content(self, content: str, content_type: str) -> str: @@ -352,7 +410,12 @@ class AsyncDatabaseManager: async with aiofiles.open(file_path, 'r', encoding='utf-8') as f: return await f.read() except: - logger.error(f"Failed to load content: {file_path}") + self.logger.error( + message="Failed to load content: {file_path}", + tag="ERROR", + force_verbose=True, + params={"file_path": file_path} + ) return None # Create a singleton instance diff --git a/crawl4ai/async_logger.py b/crawl4ai/async_logger.py new file mode 100644 index 00000000..220edd11 --- /dev/null +++ b/crawl4ai/async_logger.py @@ -0,0 +1,231 @@ +from enum import Enum +from typing import Optional, Dict, Any, Union +from colorama import Fore, Back, Style, init +import time +import os +from datetime import datetime + +class LogLevel(Enum): + DEBUG = 1 + INFO = 2 + SUCCESS = 3 + WARNING = 4 + ERROR = 5 + +class AsyncLogger: + """ + Asynchronous logger with support for colored console output and file logging. + Supports templated messages with colored components. 
+ """ + + DEFAULT_ICONS = { + 'INIT': '→', + 'READY': '✓', + 'FETCH': '↓', + 'SCRAPE': '◆', + 'EXTRACT': '■', + 'COMPLETE': '●', + 'ERROR': '×', + 'DEBUG': '⋯', + 'INFO': 'ℹ', + 'WARNING': '⚠', + } + + DEFAULT_COLORS = { + LogLevel.DEBUG: Fore.LIGHTBLACK_EX, + LogLevel.INFO: Fore.CYAN, + LogLevel.SUCCESS: Fore.GREEN, + LogLevel.WARNING: Fore.YELLOW, + LogLevel.ERROR: Fore.RED, + } + + def __init__( + self, + log_file: Optional[str] = None, + log_level: LogLevel = LogLevel.INFO, + tag_width: int = 10, + icons: Optional[Dict[str, str]] = None, + colors: Optional[Dict[LogLevel, str]] = None, + verbose: bool = True + ): + """ + Initialize the logger. + + Args: + log_file: Optional file path for logging + log_level: Minimum log level to display + tag_width: Width for tag formatting + icons: Custom icons for different tags + colors: Custom colors for different log levels + verbose: Whether to output to console + """ + init() # Initialize colorama + self.log_file = log_file + self.log_level = log_level + self.tag_width = tag_width + self.icons = icons or self.DEFAULT_ICONS + self.colors = colors or self.DEFAULT_COLORS + self.verbose = verbose + + # Create log file directory if needed + if log_file: + os.makedirs(os.path.dirname(os.path.abspath(log_file)), exist_ok=True) + + def _format_tag(self, tag: str) -> str: + """Format a tag with consistent width.""" + return f"[{tag}]".ljust(self.tag_width, ".") + + def _get_icon(self, tag: str) -> str: + """Get the icon for a tag, defaulting to info icon if not found.""" + return self.icons.get(tag, self.icons['INFO']) + + def _write_to_file(self, message: str): + """Write a message to the log file if configured.""" + if self.log_file: + timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3] + with open(self.log_file, 'a', encoding='utf-8') as f: + # Strip ANSI color codes for file output + clean_message = message.replace(Fore.RESET, '').replace(Style.RESET_ALL, '') + for color in vars(Fore).values(): + if 
isinstance(color, str): + clean_message = clean_message.replace(color, '') + f.write(f"[{timestamp}] {clean_message}\n") + + def _log( + self, + level: LogLevel, + message: str, + tag: str, + params: Optional[Dict[str, Any]] = None, + colors: Optional[Dict[str, str]] = None, + base_color: Optional[str] = None, + **kwargs + ): + """ + Core logging method that handles message formatting and output. + + Args: + level: Log level for this message + message: Message template string + tag: Tag for the message + params: Parameters to format into the message + colors: Color overrides for specific parameters + base_color: Base color for the entire message + """ + if level.value < self.log_level.value: + return + + # Format the message with parameters if provided + if params: + try: + # First format the message with raw parameters + formatted_message = message.format(**params) + + # Then apply colors if specified + if colors: + for key, color in colors.items(): + # Find the formatted value in the message and wrap it with color + if key in params: + value_str = str(params[key]) + formatted_message = formatted_message.replace( + value_str, + f"{color}{value_str}{Style.RESET_ALL}" + ) + + except KeyError as e: + formatted_message = f"LOGGING ERROR: Missing parameter {e} in message template" + level = LogLevel.ERROR + else: + formatted_message = message + + # Construct the full log line + color = base_color or self.colors[level] + log_line = f"{color}{self._format_tag(tag)} {self._get_icon(tag)} {formatted_message}{Style.RESET_ALL}" + + # Output to console if verbose + if self.verbose or kwargs.get("force_verbose", False): + print(log_line) + + # Write to file if configured + self._write_to_file(log_line) + + def debug(self, message: str, tag: str = "DEBUG", **kwargs): + """Log a debug message.""" + self._log(LogLevel.DEBUG, message, tag, **kwargs) + + def info(self, message: str, tag: str = "INFO", **kwargs): + """Log an info message.""" + self._log(LogLevel.INFO, message, tag, 
**kwargs) + + def success(self, message: str, tag: str = "SUCCESS", **kwargs): + """Log a success message.""" + self._log(LogLevel.SUCCESS, message, tag, **kwargs) + + def warning(self, message: str, tag: str = "WARNING", **kwargs): + """Log a warning message.""" + self._log(LogLevel.WARNING, message, tag, **kwargs) + + def error(self, message: str, tag: str = "ERROR", **kwargs): + """Log an error message.""" + self._log(LogLevel.ERROR, message, tag, **kwargs) + + def url_status( + self, + url: str, + success: bool, + timing: float, + tag: str = "FETCH", + url_length: int = 50 + ): + """ + Convenience method for logging URL fetch status. + + Args: + url: The URL being processed + success: Whether the operation was successful + timing: Time taken for the operation + tag: Tag for the message + url_length: Maximum length for URL in log + """ + self._log( + level=LogLevel.SUCCESS if success else LogLevel.ERROR, + message="{url:.{url_length}}... | Status: {status} | Time: {timing:.2f}s", + tag=tag, + params={ + "url": url, + "url_length": url_length, + "status": success, + "timing": timing + }, + colors={ + "status": Fore.GREEN if success else Fore.RED, + "timing": Fore.YELLOW + } + ) + + def error_status( + self, + url: str, + error: str, + tag: str = "ERROR", + url_length: int = 50 + ): + """ + Convenience method for logging error status. + + Args: + url: The URL being processed + error: Error message + tag: Tag for the message + url_length: Maximum length for URL in log + """ + self._log( + level=LogLevel.ERROR, + message="{url:.{url_length}}... 
| Error: {error}", + tag=tag, + params={ + "url": url, + "url_length": url_length, + "error": error + } + ) \ No newline at end of file diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 79a17ac4..5fe7822c 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -15,6 +15,7 @@ from .extraction_strategy import * from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode from .content_scrapping_strategy import WebScrapingStrategy +from .async_logger import AsyncLogger from .config import ( MIN_WORD_THRESHOLD, @@ -74,19 +75,29 @@ class AsyncWebCrawler: always_by_pass_cache: Deprecated, use always_bypass_cache instead base_directory: Base directory for storing cache """ - init() - self.log_width = 10 # Width of "[COMPLETE]" - self.tag_format = lambda tag: f"[{tag}]".ljust(self.log_width, ".") - self.log_icons = { - 'INIT': '→', # Alternative: '▶' or '►' - 'READY': '✓', # Alternative: '√' - 'FETCH': '↓', # Alternative: '▼' - 'SCRAPE': '◆', # Alternative: '♦' - 'EXTRACT': '■', # Alternative: '□' - 'COMPLETE': '●', # Alternative: '○' - 'ERROR': '×' - } - self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy(**kwargs) + # init() + # self.log_width = 10 # Width of "[COMPLETE]" + # self.tag_format = lambda tag: f"[{tag}]".ljust(self.log_width, ".") + # self.log_icons = { + # 'INIT': '→', # Alternative: '▶' or '►' + # 'READY': '✓', # Alternative: '√' + # 'FETCH': '↓', # Alternative: '▼' + # 'SCRAPE': '◆', # Alternative: '♦' + # 'EXTRACT': '■', # Alternative: '□' + # 'COMPLETE': '●', # Alternative: '○' + # 'ERROR': '×' + # } + self.verbose = kwargs.get("verbose", False) + self.logger = AsyncLogger( + log_file=os.path.join(base_directory, ".crawl4ai", "crawler.log"), + verbose=self.verbose, + tag_width=10 + ) + + self.crawler_strategy = crawler_strategy or 
AsyncPlaywrightCrawlerStrategy( + logger = self.logger, + **kwargs + ) # Handle deprecated parameter if always_by_pass_cache is not None: @@ -118,12 +129,13 @@ class AsyncWebCrawler: async def awarmup(self): """Initialize the crawler with warm-up sequence.""" - if self.verbose: - print(f"{Fore.CYAN}{self.tag_format('INIT')} {self.log_icons['INIT']} Crawl4AI {crawl4ai_version}{Style.RESET_ALL}") - print(f"{Fore.CYAN}{self.tag_format('INIT')} {self.log_icons['INIT']} Warming up AsyncWebCrawler{Style.RESET_ALL}") + self.logger.info(f"Crawl4AI {crawl4ai_version}", tag="INIT") + # if self.verbose: + # print(f"{Fore.CYAN}{self.tag_format('INIT')} {self.log_icons['INIT']} Crawl4AI {crawl4ai_version}{Style.RESET_ALL}") + # print(f"{Fore.CYAN}{self.tag_format('INIT')} {self.log_icons['INIT']} Warming up AsyncWebCrawler{Style.RESET_ALL}") self.ready = True - if self.verbose: - print(f"{Fore.GREEN}{self.tag_format('READY')} {self.log_icons['READY']} AsyncWebCrawler initialized{Style.RESET_ALL}") + # if self.verbose: + # print(f"{Fore.GREEN}{self.tag_format('READY')} {self.log_icons['READY']} AsyncWebCrawler initialized{Style.RESET_ALL}") async def arun( self, @@ -234,8 +246,14 @@ class AsyncWebCrawler: screenshot_data = cached_result.screenshot if not screenshot_data: cached_result = None - if verbose: - print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Cache hit for {cache_context.display_url} | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {time.perf_counter() - start_time:.2f}s") + # if verbose: + # print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Cache hit for {cache_context.display_url} | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {time.perf_counter() - start_time:.2f}s") + self.logger.url_status( + url=cache_context.display_url, + success=bool(html), + timing=time.perf_counter() - start_time, + tag="FETCH" + ) # Fetch fresh content if needed @@ 
-252,8 +270,14 @@ class AsyncWebCrawler: html = sanitize_input_encode(async_response.html) screenshot_data = async_response.screenshot t2 = time.perf_counter() - if verbose: - print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Live fetch for {cache_context.display_url}... | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {t2 - t1:.2f}s") + self.logger.url_status( + url=cache_context.display_url, + success=bool(html), + timing=t2 - t1, + tag="FETCH" + ) + # if verbose: + # print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Live fetch for {cache_context.display_url}... | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {t2 - t1:.2f}s") # Process the HTML content crawl_result = await self.aprocess_html( @@ -287,9 +311,21 @@ class AsyncWebCrawler: crawl_result.success = bool(html) crawl_result.session_id = kwargs.get("session_id", None) - if verbose: - print(f"{Fore.GREEN}{self.tag_format('COMPLETE')} {self.log_icons['COMPLETE']} {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... | Status: {Fore.GREEN if crawl_result.success else Fore.RED}{crawl_result.success} | {Fore.YELLOW}Total: {time.perf_counter() - start_time:.2f}s{Style.RESET_ALL}") - + # if verbose: + # print(f"{Fore.GREEN}{self.tag_format('COMPLETE')} {self.log_icons['COMPLETE']} {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... | Status: {Fore.GREEN if crawl_result.success else Fore.RED}{crawl_result.success} | {Fore.YELLOW}Total: {time.perf_counter() - start_time:.2f}s{Style.RESET_ALL}") + self.logger.success( + message="{url:.50}... 
| Status: {status} | Total: {timing}", + tag="COMPLETE", + params={ + "url": cache_context.display_url, + "status": crawl_result.success, + "timing": f"{time.perf_counter() - start_time:.2f}s" + }, + colors={ + "status": Fore.GREEN if crawl_result.success else Fore.RED, + "timing": Fore.YELLOW + } + ) # Update cache if appropriate if cache_context.should_write() and not bool(cached_result): @@ -300,7 +336,12 @@ class AsyncWebCrawler: except Exception as e: if not hasattr(e, "msg"): e.msg = str(e) - print(f"{Fore.RED}{self.tag_format('ERROR')} {self.log_icons['ERROR']} Failed to crawl {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... | {e.msg}{Style.RESET_ALL}") + # print(f"{Fore.RED}{self.tag_format('ERROR')} {self.log_icons['ERROR']} Failed to crawl {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... | {e.msg}{Style.RESET_ALL}") + self.logger.error_status( + url=cache_context.display_url, + error=e.msg, + tag="ERROR" + ) return CrawlResult( url=url, html="", @@ -362,7 +403,12 @@ class AsyncWebCrawler: domain = urlparse(url).netloc current_time = time.time() - print(f"{Fore.LIGHTBLACK_EX}{self.tag_format('PARALLEL')} Started task for {url[:50]}...{Style.RESET_ALL}") + # print(f"{Fore.LIGHTBLACK_EX}{self.tag_format('PARALLEL')} Started task for {url[:50]}...{Style.RESET_ALL}") + self.logger.debug( + message="Started task for {url:.50}...", + tag="PARALLEL", + params={"url": url} + ) # Get delay settings from kwargs or use defaults mean_delay = kwargs.get('mean_delay', 0.1) # 0.5 seconds default mean delay @@ -394,12 +440,26 @@ class AsyncWebCrawler: ) # Print start message - print(f"{Fore.CYAN}{self.tag_format('INIT')} {self.log_icons['INIT']} Starting concurrent crawling for {len(urls)} URLs...{Style.RESET_ALL}") + # print(f"{Fore.CYAN}{self.tag_format('INIT')} {self.log_icons['INIT']} Starting concurrent crawling for {len(urls)} URLs...{Style.RESET_ALL}") + self.logger.info( + message="Starting concurrent crawling for {count} URLs...", + tag="INIT", + 
params={"count": len(urls)} + ) start_time = time.perf_counter() tasks = [crawl_with_semaphore(url) for url in urls] results = await asyncio.gather(*tasks, return_exceptions=True) end_time = time.perf_counter() - print(f"{Fore.YELLOW}{self.tag_format('COMPLETE')} {self.log_icons['COMPLETE']} Concurrent crawling completed for {len(urls)} URLs | Total time: {end_time - start_time:.2f}s{Style.RESET_ALL}") + # print(f"{Fore.YELLOW}{self.tag_format('COMPLETE')} {self.log_icons['COMPLETE']} Concurrent crawling completed for {len(urls)} URLs | Total time: {end_time - start_time:.2f}s{Style.RESET_ALL}") + self.logger.success( + message="Concurrent crawling completed for {count} URLs | " + Fore.YELLOW + " Total time: {timing}" + Style.RESET_ALL, + tag="COMPLETE", + params={ + "count": len(urls), + "timing": f"{end_time - start_time:.2f}s" + }, + colors={"timing": Fore.YELLOW} + ) return [result if not isinstance(result, Exception) else str(result) for result in results] @@ -451,9 +511,16 @@ class AsyncWebCrawler: links = result.get("links", []) metadata = result.get("metadata", {}) - if verbose: - print(f"{Fore.MAGENTA}{self.tag_format('SCRAPE')} {self.log_icons['SCRAPE']} Processed {_url[:URL_LOG_SHORTEN_LENGTH]}...{Style.RESET_ALL} | Time: {int((time.perf_counter() - t1) * 1000)}ms") - + # if verbose: + # print(f"{Fore.MAGENTA}{self.tag_format('SCRAPE')} {self.log_icons['SCRAPE']} Processed {_url[:URL_LOG_SHORTEN_LENGTH]}...{Style.RESET_ALL} | Time: {int((time.perf_counter() - t1) * 1000)}ms") + self.logger.info( + message="Processed {url:.50}... 
| Time: {timing}ms", + tag="SCRAPE", + params={ + "url": _url, + "timing": int((time.perf_counter() - t1) * 1000) + } + ) if extracted_content is None and extraction_strategy and chunking_strategy and not isinstance(extraction_strategy, NoExtractionStrategy): @@ -467,8 +534,17 @@ class AsyncWebCrawler: sections = chunking_strategy.chunk(markdown) extracted_content = extraction_strategy.run(url, sections) extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False) - if verbose: - print(f"{Fore.YELLOW}{self.tag_format('EXTRACT')} {self.log_icons['EXTRACT']} Completed for {_url[:URL_LOG_SHORTEN_LENGTH]}...{Style.RESET_ALL} | Time: {time.perf_counter() - t1:.2f}s{Style.RESET_ALL}") + # if verbose: + # print(f"{Fore.YELLOW}{self.tag_format('EXTRACT')} {self.log_icons['EXTRACT']} Completed for {_url[:URL_LOG_SHORTEN_LENGTH]}...{Style.RESET_ALL} | Time: {time.perf_counter() - t1:.2f}s{Style.RESET_ALL}") + self.logger.info( + message="Completed for {url:.50}... | Time: {timing}s", + tag="EXTRACT", + params={ + "url": _url, + "timing": time.perf_counter() - t1 + } + ) + diff --git a/crawl4ai/content_filter_strategy.py b/crawl4ai/content_filter_strategy.py index 88375da9..88216f7f 100644 --- a/crawl4ai/content_filter_strategy.py +++ b/crawl4ai/content_filter_strategy.py @@ -8,6 +8,10 @@ from bs4 import BeautifulSoup, NavigableString, Tag from .utils import clean_tokens from abc import ABC, abstractmethod +from snowballstemmer import stemmer + +# from nltk.stem import PorterStemmer +# ps = PorterStemmer() class RelevantContentFilter(ABC): def __init__(self, user_query: str = None): self.user_query = user_query @@ -252,7 +256,7 @@ class RelevantContentFilter(ABC): return str(tag) # Fallback to original if anything fails class BM25ContentFilter(RelevantContentFilter): - def __init__(self, user_query: str = None, bm25_threshold: float = 1.0): + def __init__(self, user_query: str = None, bm25_threshold: float = 1.0, language: str = 'english'): 
super().__init__(user_query=user_query) self.bm25_threshold = bm25_threshold self.priority_tags = { @@ -268,6 +272,7 @@ class BM25ContentFilter(RelevantContentFilter): 'pre': 1.5, 'th': 1.5, # Table headers } + self.stemmer = stemmer(language) def filter_content(self, html: str) -> List[str]: """Implements content filtering using BM25 algorithm with priority tag handling""" @@ -282,58 +287,42 @@ class BM25ContentFilter(RelevantContentFilter): if not candidates: return [] - # Split into priority and regular candidates - priority_candidates = [] - regular_candidates = [] + # Tokenize corpus + # tokenized_corpus = [chunk.lower().split() for _, chunk, _, _ in candidates] + # tokenized_query = query.lower().split() + + # tokenized_corpus = [[ps.stem(word) for word in chunk.lower().split()] + # for _, chunk, _, _ in candidates] + # tokenized_query = [ps.stem(word) for word in query.lower().split()] - for index, chunk, tag_type, tag in candidates: - if tag.name in self.priority_tags: - priority_candidates.append((index, chunk, tag_type, tag)) - else: - regular_candidates.append((index, chunk, tag_type, tag)) + tokenized_corpus = [[self.stemmer.stemWord(word) for word in chunk.lower().split()] + for _, chunk, _, _ in candidates] + tokenized_query = [self.stemmer.stemWord(word) for word in query.lower().split()] - # Process regular content with BM25 - tokenized_corpus = [chunk.lower().split() for _, chunk, _, _ in regular_candidates] - tokenized_query = query.lower().split() - # Clean from stop words and noise tokenized_corpus = [clean_tokens(tokens) for tokens in tokenized_corpus] tokenized_query = clean_tokens(tokenized_query) - + bm25 = BM25Okapi(tokenized_corpus) scores = bm25.get_scores(tokenized_query) - # Score and boost regular candidates - scored_candidates = [ - (score * self.priority_tags.get(tag.name, 1.0), index, chunk, tag_type, tag) - for score, (index, chunk, tag_type, tag) in zip(scores, regular_candidates) + # Adjust scores with tag weights + 
adjusted_candidates = [] + for score, (index, chunk, tag_type, tag) in zip(scores, candidates): + tag_weight = self.priority_tags.get(tag.name, 1.0) + adjusted_score = score * tag_weight + adjusted_candidates.append((adjusted_score, index, chunk, tag)) + + # Filter candidates by threshold + selected_candidates = [ + (index, chunk, tag) for adjusted_score, index, chunk, tag in adjusted_candidates + if adjusted_score >= self.bm25_threshold ] - scored_candidates.sort(key=lambda x: x[0], reverse=True) - - # Process scored candidates - selected_tags = set() - selected_candidates = [] - - # First add all priority candidates - for index, chunk, tag_type, tag in priority_candidates: - tag_id = id(tag) - if tag_id not in selected_tags: - selected_candidates.append((index, chunk, tag)) - selected_tags.add(tag_id) - - # Then add scored regular candidates that meet threshold - for score, index, chunk, tag_type, tag in scored_candidates: - if score < self.bm25_threshold: - continue - tag_id = id(tag) - if tag_id not in selected_tags: - selected_candidates.append((index, chunk, tag)) - selected_tags.add(tag_id) if not selected_candidates: return [] - # Sort by original document order + # Sort selected candidates by original document order selected_candidates.sort(key=lambda x: x[0]) - return [self.clean_element(tag) for _, _, tag in selected_candidates] + return [self.clean_element(tag) for _, _, tag in selected_candidates] diff --git a/crawl4ai/content_scrapping_strategy.py b/crawl4ai/content_scrapping_strategy.py index d16b0680..0f470671 100644 --- a/crawl4ai/content_scrapping_strategy.py +++ b/crawl4ai/content_scrapping_strategy.py @@ -149,6 +149,15 @@ class ContentScrapingStrategy(ABC): pass class WebScrapingStrategy(ContentScrapingStrategy): + def __init__(self, logger=None): + self.logger = logger + + def _log(self, level, message, tag="SCRAPE", **kwargs): + """Helper method to safely use logger.""" + if self.logger: + log_method = getattr(self.logger, level) + 
log_method(message=message, tag=tag, **kwargs) + def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]: return self._get_content_of_website_optimized(url, html, is_async=False, **kwargs) @@ -167,7 +176,12 @@ class WebScrapingStrategy(ContentScrapingStrategy): try: meta = extract_metadata("", soup) except Exception as e: - print('Error extracting metadata:', str(e)) + self._log('error', + message="Error extracting metadata: {error}", + tag="SCRAPE", + params={"error": str(e)} + ) + # print('Error extracting metadata:', str(e)) meta = {} @@ -430,9 +444,12 @@ class WebScrapingStrategy(ContentScrapingStrategy): try: remove_unwanted_attributes(element, IMPORTANT_ATTRS, kwargs.get('keep_data_attributes', False)) except Exception as e: - print('Error removing unwanted attributes:', str(e)) - - + # print('Error removing unwanted attributes:', str(e)) + self._log('error', + message="Error removing unwanted attributes: {error}", + tag="SCRAPE", + params={"error": str(e)} + ) # Process children for child in list(element.children): if isinstance(child, NavigableString) and not isinstance(child, Comment): @@ -453,7 +470,12 @@ class WebScrapingStrategy(ContentScrapingStrategy): return keep_element except Exception as e: - print('Error processing element:', str(e)) + # print('Error processing element:', str(e)) + self._log('error', + message="Error processing element: {error}", + tag="SCRAPE", + params={"error": str(e)} + ) return False process_element(body) @@ -516,7 +538,10 @@ class WebScrapingStrategy(ContentScrapingStrategy): str_body = body.encode_contents().decode('utf-8') print(f"[LOG] 😧 Error: After processing the crawled HTML and removing irrelevant tags, nothing was left in the page. Check the markdown for further details.") - + self._log('error', + message="After processing the crawled HTML and removing irrelevant tags, nothing was left in the page. 
Check the markdown for further details.", + tag="SCRAPE" + ) cleaned_html = str_body.replace('\n\n', '\n').replace(' ', ' ') @@ -525,6 +550,13 @@ class WebScrapingStrategy(ContentScrapingStrategy): h.update_params(**kwargs.get('html2text', {})) markdown = h.handle(cleaned_html) except Exception as e: + if not h: + h = CustomHTML2Text() + self._log('error', + message="Error converting HTML to markdown: {error}", + tag="SCRAPE", + params={"error": str(e)} + ) markdown = h.handle(sanitize_html(cleaned_html)) markdown = markdown.replace(' ```', '```') diff --git a/crawl4ai/version_manager.py b/crawl4ai/version_manager.py index 07e0c0e9..8ae2de2e 100644 --- a/crawl4ai/version_manager.py +++ b/crawl4ai/version_manager.py @@ -20,11 +20,11 @@ class VersionManager: def update_version(self): """Update the version file to current library version""" - self.version_file.write_text(__version__) + self.version_file.write_text(__version__.__version__) def needs_update(self): """Check if database needs update based on version""" installed = self.get_installed_version() - current = version.parse(__version__) + current = version.parse(__version__.__version__) return installed is None or installed < current diff --git a/docker-compose.hub.yml b/docker-compose.hub.yml new file mode 100644 index 00000000..9bcfa982 --- /dev/null +++ b/docker-compose.hub.yml @@ -0,0 +1,27 @@ +services: + crawl4ai: + image: unclecode/crawl4ai:basic # Pull image from Docker Hub + ports: + - "11235:11235" # FastAPI server + - "8000:8000" # Alternative port + - "9222:9222" # Browser debugging + - "8080:8080" # Additional port + environment: + - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} # Optional API token + - OPENAI_API_KEY=${OPENAI_API_KEY:-} # Optional OpenAI API key + - CLAUDE_API_KEY=${CLAUDE_API_KEY:-} # Optional Claude API key + volumes: + - /dev/shm:/dev/shm # Shared memory for browser operations + deploy: + resources: + limits: + memory: 4G + reservations: + memory: 1G + restart: unless-stopped + 
healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:11235/health"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 40s diff --git a/docker-compose.local.yml b/docker-compose.local.yml new file mode 100644 index 00000000..7dc41b47 --- /dev/null +++ b/docker-compose.local.yml @@ -0,0 +1,33 @@ +services: + crawl4ai: + build: + context: . + dockerfile: Dockerfile + args: + PYTHON_VERSION: 3.10 + INSTALL_TYPE: all + ENABLE_GPU: false + ports: + - "11235:11235" # FastAPI server + - "8000:8000" # Alternative port + - "9222:9222" # Browser debugging + - "8080:8080" # Additional port + environment: + - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} # Optional API token + - OPENAI_API_KEY=${OPENAI_API_KEY:-} # Optional OpenAI API key + - CLAUDE_API_KEY=${CLAUDE_API_KEY:-} # Optional Claude API key + volumes: + - /dev/shm:/dev/shm # Shared memory for browser operations + deploy: + resources: + limits: + memory: 4G + reservations: + memory: 1G + restart: unless-stopped + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:11235/health"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 40s \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index ef0dc9e4..1097ef11 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,5 +1,3 @@ -version: '3.8' - services: crawl4ai: build: @@ -9,15 +7,18 @@ services: PYTHON_VERSION: 3.10 INSTALL_TYPE: all ENABLE_GPU: false + profiles: ["local"] ports: - - "11235:11235" # FastAPI server - - "8000:8000" # Alternative port - - "9222:9222" # Browser debugging - - "8080:8080" # Additional port + - "11235:11235" + - "8000:8000" + - "9222:9222" + - "8080:8080" environment: - - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} # Optional API token + - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} + - OPENAI_API_KEY=${OPENAI_API_KEY:-} + - CLAUDE_API_KEY=${CLAUDE_API_KEY:-} volumes: - - /dev/shm:/dev/shm # Shared memory for browser operations + - /dev/shm:/dev/shm deploy: resources: limits: 
@@ -30,4 +31,32 @@ services: interval: 30s timeout: 10s retries: 3 - start_period: 40s \ No newline at end of file + start_period: 40s + + crawl4ai-hub: + image: unclecode/crawl4ai:basic + profiles: ["hub"] + ports: + - "11235:11235" + - "8000:8000" + - "9222:9222" + - "8080:8080" + environment: + - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} + - OPENAI_API_KEY=${OPENAI_API_KEY:-} + - CLAUDE_API_KEY=${CLAUDE_API_KEY:-} + volumes: + - /dev/shm:/dev/shm + deploy: + resources: + limits: + memory: 4G + reservations: + memory: 1G + restart: unless-stopped + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:11235/health"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 40s diff --git a/docs/examples/v0.3.74.overview.py b/docs/examples/v0.3.74.overview.py index ec3a7d73..00296740 100644 --- a/docs/examples/v0.3.74.overview.py +++ b/docs/examples/v0.3.74.overview.py @@ -1,9 +1,16 @@ +import os, sys +# append the parent directory to the sys.path +parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +sys.path.append(parent_dir) +parent_parent_dir = os.path.dirname(parent_dir) +sys.path.append(parent_parent_dir) +__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) +__data__ = os.path.join(__location__, "__data") import asyncio -import os from pathlib import Path import aiohttp import json -from crawl4ai import AsyncWebCrawler +from crawl4ai import AsyncWebCrawler, CacheMode from crawl4ai.content_filter_strategy import BM25ContentFilter # 1. 
File Download Processing Example @@ -32,7 +39,8 @@ async def download_example(): console.log('No .exe download link found'); } """, - wait_for=5 # Wait 5 seconds to ensure download starts + delay_before_return_html=1, # Wait 5 seconds to ensure download starts + cache_mode=CacheMode.BYPASS ) if result.downloaded_files: @@ -50,22 +58,32 @@ async def content_filtering_example(): async with AsyncWebCrawler(verbose=True) as crawler: # Create filter with custom query for OpenAI's blog content_filter = BM25ContentFilter( - user_query="AI language models research innovation", + # user_query="Investment and fundraising", + # user_query="Robotic", bm25_threshold=1.0 ) result = await crawler.arun( - url="https://openai.com/blog", - content_filter=content_filter + url="https://techcrunch.com/", + content_filter=content_filter, + cache_mode=CacheMode.BYPASS ) - print(f"Filtered content: {result.extracted_content}") + print(f"Filtered content: {len(result.fit_markdown)}") + print(f"Filtered content: {result.fit_markdown}") + + # Save html + with open(os.path.join(__data__, "techcrunch.html"), "w") as f: + f.write(result.fit_html) + + with open(os.path.join(__data__, "filtered_content.md"), "w") as f: + f.write(result.fit_markdown) # 3. 
Local File and Raw HTML Processing Example async def local_and_raw_html_example(): """Example of processing local files and raw HTML""" # Create a sample HTML file - sample_file = "sample.html" + sample_file = os.path.join(__data__, "sample.html") with open(sample_file, "w") as f: f.write(""" @@ -112,21 +130,18 @@ async def browser_management_example(): headless=False, verbose=True ) as crawler: + + result = await crawler.arun( + url="https://crawl4ai.com", + # session_id="persistent_session_1", + cache_mode=CacheMode.BYPASS + ) # Use GitHub as an example - it's a good test for browser management # because it requires proper browser handling result = await crawler.arun( url="https://github.com/trending", - session_id="persistent_session_1", - js_code=""" - // Custom JavaScript to execute on GitHub's trending page - const repos = document.querySelectorAll('article.Box-row'); - const data = Array.from(repos).map(repo => ({ - name: repo.querySelector('h2')?.textContent?.trim(), - description: repo.querySelector('p')?.textContent?.trim(), - language: repo.querySelector('[itemprop="programmingLanguage"]')?.textContent?.trim() - })); - console.log('Trending repositories:', JSON.stringify(data, null, 2)); - """ + # session_id="persistent_session_1", + cache_mode=CacheMode.BYPASS ) print("\nBrowser session result:", result.success) @@ -136,6 +151,8 @@ async def browser_management_example(): # 5. 
API Usage Example async def api_example(): """Example of using the new API endpoints""" + api_token = os.getenv('CRAWL4AI_API_TOKEN') or "test_api_code" + headers = {'Authorization': f'Bearer {api_token}'} async with aiohttp.ClientSession() as session: # Submit crawl job crawl_request = { @@ -143,52 +160,78 @@ async def api_example(): "extraction_config": { "type": "json_css", "params": { - "selectors": { - "titles": ".title a", - "scores": ".score", - "comments": ".comment-tree" + "schema": { + "name": "Hacker News Articles", + "baseSelector": ".athing", + "fields": [ + { + "name": "title", + "selector": ".title a", + "type": "text" + }, + { + "name": "score", + "selector": ".score", + "type": "text" + }, + { + "name": "url", + "selector": ".title a", + "type": "attribute", + "attribute": "href" + } + ] } } }, "crawler_params": { "headless": True, - "use_managed_browser": True + # "use_managed_browser": True }, - "screenshot": True, - "magic": True + "cache_mode": "bypass", + # "screenshot": True, + # "magic": True } async with session.post( "http://localhost:11235/crawl", - json=crawl_request + json=crawl_request, + headers=headers ) as response: task_data = await response.json() task_id = task_data["task_id"] # Check task status - async with session.get( - f"http://localhost:11235/task/{task_id}" - ) as status_response: - result = await status_response.json() - print(f"Task result: {result}") + while True: + async with session.get( + f"http://localhost:11235/task/{task_id}", + headers=headers + ) as status_response: + result = await status_response.json() + print(f"Task result: {result}") + + if result["status"] == "completed": + break + else: + await asyncio.sleep(1) # Main execution async def main(): - print("Running Crawl4AI feature examples...") + # print("Running Crawl4AI feature examples...") - print("\n1. Running Download Example:") + # print("\n1. Running Download Example:") await download_example() - print("\n2. 
Running Content Filtering Example:") + # print("\n2. Running Content Filtering Example:") await content_filtering_example() - print("\n3. Running Local and Raw HTML Example:") + # print("\n3. Running Local and Raw HTML Example:") await local_and_raw_html_example() - print("\n4. Running Browser Management Example:") + # print("\n4. Running Browser Management Example:") await browser_management_example() - print("\n5. Running API Example:") + # print("\n5. Running API Example:") await api_example() if __name__ == "__main__": diff --git a/docs/md_v2/basic/docker-deploymeny.md b/docs/md_v2/basic/docker-deploymeny.md index 30555708..87e468aa 100644 --- a/docs/md_v2/basic/docker-deploymeny.md +++ b/docs/md_v2/basic/docker-deploymeny.md @@ -15,6 +15,94 @@ docker run -p 11235:11235 unclecode/crawl4ai:basic docker run -p 11235:11235 -e CRAWL4AI_API_TOKEN=your_secret_token unclecode/crawl4ai:basic ``` +## Running with Docker Compose 🐳 + +### Use Docker Compose (From Local Dockerfile or Docker Hub) + +Crawl4AI provides flexibility to use Docker Compose for managing your containerized services. You can either build the image locally from the provided `Dockerfile` or use the pre-built image from Docker Hub. + +### **Option 1: Using Docker Compose to Build Locally** +If you want to build the image locally, use the provided `docker-compose.local.yml` file. + +```bash +docker-compose -f docker-compose.local.yml up -d +``` + +This will: +1. Build the Docker image from the provided `Dockerfile`. +2. Start the container and expose it on `http://localhost:11235`. + +--- + +### **Option 2: Using Docker Compose with Pre-Built Image from Hub** +If you prefer using the pre-built image on Docker Hub, use the `docker-compose.hub.yml` file. + +```bash +docker-compose -f docker-compose.hub.yml up -d +``` + +This will: +1. Pull the pre-built image `unclecode/crawl4ai:basic` (or `all`, depending on your configuration). +2. Start the container and expose it on `http://localhost:11235`. 
+ +--- + +### **Stopping the Running Services** + +To stop the services started via Docker Compose, you can use: + +```bash +docker-compose -f docker-compose.local.yml down +# OR +docker-compose -f docker-compose.hub.yml down +``` + +If the containers don’t stop and the application is still running, check the running containers: + +```bash +docker ps +``` + +Find the `CONTAINER ID` of the running service and stop it forcefully: + +```bash +docker stop <CONTAINER_ID> +``` + +--- + +### **Debugging with Docker Compose** + +- **Check Logs**: To view the container logs: + ```bash + docker-compose -f docker-compose.local.yml logs -f + ``` + +- **Remove Orphaned Containers**: If the service is still running unexpectedly: + ```bash + docker-compose -f docker-compose.local.yml down --remove-orphans + ``` + +- **Manually Remove Network**: If the network is still in use: + ```bash + docker network ls + docker network rm crawl4ai_default + ``` + +--- + +### Why Use Docker Compose? + +Docker Compose is the recommended way to deploy Crawl4AI because: +1. It simplifies multi-container setups. +2. Allows you to define environment variables, resources, and ports in a single file. +3. Makes it easier to switch between local development and production-ready images. + +For example, your `docker-compose.yml` could include API keys, token settings, and memory limits, making deployment quick and consistent.
+ + + + ## API Security 🔒 ### Understanding CRAWL4AI_API_TOKEN diff --git a/main.py b/main.py index ee5f7fc6..6d217410 100644 --- a/main.py +++ b/main.py @@ -26,6 +26,7 @@ from enum import Enum from dataclasses import dataclass import json from crawl4ai import AsyncWebCrawler, CrawlResult, CacheMode +from crawl4ai.config import MIN_WORD_THRESHOLD from crawl4ai.extraction_strategy import ( LLMExtractionStrategy, CosineStrategy, @@ -53,12 +54,20 @@ class ExtractionConfig(BaseModel): type: CrawlerType params: Dict[str, Any] = {} +class ChunkingStrategy(BaseModel): + type: str + params: Dict[str, Any] = {} + +class ContentFilter(BaseModel): + type: str = "bm25" + params: Dict[str, Any] = {} + class CrawlRequest(BaseModel): urls: Union[HttpUrl, List[HttpUrl]] + word_count_threshold: int = MIN_WORD_THRESHOLD extraction_config: Optional[ExtractionConfig] = None - crawler_params: Dict[str, Any] = {} - priority: int = Field(default=5, ge=1, le=10) - ttl: Optional[int] = 3600 + chunking_strategy: Optional[ChunkingStrategy] = None + content_filter: Optional[ContentFilter] = None js_code: Optional[List[str]] = None wait_for: Optional[str] = None css_selector: Optional[str] = None @@ -66,7 +75,10 @@ class CrawlRequest(BaseModel): magic: bool = False extra: Optional[Dict[str, Any]] = {} session_id: Optional[str] = None - cache_mode: Optional[CacheMode] = None + cache_mode: Optional[CacheMode] = CacheMode.ENABLED + priority: int = Field(default=5, ge=1, le=10) + ttl: Optional[int] = 3600 + crawler_params: Dict[str, Any] = {} @dataclass class TaskInfo: @@ -280,6 +292,7 @@ class CrawlerService: if isinstance(request.urls, list): results = await crawler.arun_many( urls=[str(url) for url in request.urls], + word_count_threshold=MIN_WORD_THRESHOLD, extraction_strategy=extraction_strategy, js_code=request.js_code, wait_for=request.wait_for, @@ -287,6 +300,7 @@ class CrawlerService: screenshot=request.screenshot, magic=request.magic, session_id=request.session_id, + 
cache_mode=request.cache_mode, **request.extra, ) else: @@ -299,6 +313,7 @@ class CrawlerService: screenshot=request.screenshot, magic=request.magic, session_id=request.session_id, + cache_mode=request.cache_mode, **request.extra, ) diff --git a/requirements.txt b/requirements.txt index e6294cc5..ed259ac9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,4 +12,5 @@ tf-playwright-stealth~=1.0 xxhash~=3.4 rank-bm25~=0.2 aiofiles~=24.0 -colorama~=0.4 \ No newline at end of file +colorama~=0.4 +snowballstemmer~=2.2 \ No newline at end of file From 73658c758affac33d1c96ce274735025012da370 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Tue, 19 Nov 2024 16:10:43 +0800 Subject: [PATCH 046/115] chore: update .gitignore to include manage-collab.sh --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 0acec10f..da4b5f88 100644 --- a/.gitignore +++ b/.gitignore @@ -210,4 +210,5 @@ git_issues.md .issues/ .docs/ .issues/ -.gitboss/ \ No newline at end of file +.gitboss/ +manage-collab.sh \ No newline at end of file From 593c7ad307489edc6a12f2f594bc7827aacbc6f7 Mon Sep 17 00:00:00 2001 From: ntohidikplay <“nasrin@kplay”.team> Date: Tue, 19 Nov 2024 11:45:26 +0100 Subject: [PATCH 047/115] test: trying to push to main --- test.txt | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 test.txt diff --git a/test.txt b/test.txt new file mode 100644 index 00000000..e69de29b From 3aae30ed2a2fdd57e1bb9b6374238247d1013974 Mon Sep 17 00:00:00 2001 From: ntohidikplay <“nasrin@kplay”.team> Date: Tue, 19 Nov 2024 11:57:07 +0100 Subject: [PATCH 048/115] test1: trying to push to main --- test1.txt | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 test1.txt diff --git a/test1.txt b/test1.txt new file mode 100644 index 00000000..e69de29b From 2f19d386930b48f6758053dd4791b3da9e3a0f29 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Tue, 19 Nov 2024 19:02:41 +0800 Subject: [PATCH 049/115] Update 
.gitignore to include .gitboss/ and todo_executor.md --- .gitignore | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 4c3e151e..b92a0b0d 100644 --- a/.gitignore +++ b/.gitignore @@ -208,4 +208,6 @@ git_issues.md .tests/ .issues/ .docs/ -.issues/ \ No newline at end of file +.issues/ +.gitboss/ +todo_executor.md \ No newline at end of file From fbcff85ecb6d189fe77ca979017de9e3415481ce Mon Sep 17 00:00:00 2001 From: UncleCode Date: Tue, 19 Nov 2024 19:03:23 +0800 Subject: [PATCH 050/115] Remove test files --- test.txt | 0 test1.txt | 0 2 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 test.txt delete mode 100644 test1.txt diff --git a/test.txt b/test.txt deleted file mode 100644 index e69de29b..00000000 diff --git a/test1.txt b/test1.txt deleted file mode 100644 index e69de29b..00000000 From a6dad3fc6d436af25f65c083c0f3cb2d6f8f9fc1 Mon Sep 17 00:00:00 2001 From: ntohidikplay <“nasrin@kplay”.team> Date: Tue, 19 Nov 2024 12:09:33 +0100 Subject: [PATCH 051/115] test: trying to push to 0.3.74 --- test3.txt | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 test3.txt diff --git a/test3.txt b/test3.txt new file mode 100644 index 00000000..e69de29b From f2cb7d506dbe78bd29d6d6b32bd56f43ec44b352 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Tue, 19 Nov 2024 19:12:14 +0800 Subject: [PATCH 052/115] Delete test3.txt --- test3.txt | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 test3.txt diff --git a/test3.txt b/test3.txt deleted file mode 100644 index e69de29b..00000000 From b654c49e55194da47945e726fe18a5fbded68062 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Tue, 19 Nov 2024 19:32:06 +0800 Subject: [PATCH 053/115] Update .gitignore to exclude additional scripts and files --- .gitignore | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index b92a0b0d..de75f544 100644 --- a/.gitignore +++ b/.gitignore @@ -210,4 +210,6 @@ 
git_issues.md .docs/ .issues/ .gitboss/ -todo_executor.md \ No newline at end of file +todo_executor.md +protect-all-except-feature.sh +manage-collab.sh \ No newline at end of file From 2bdec1fa5a8d13f66598e15ff37d45ef75d5e830 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Tue, 19 Nov 2024 19:33:04 +0800 Subject: [PATCH 054/115] chore: add manage-collab.sh to .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index da4b5f88..0fb09933 100644 --- a/.gitignore +++ b/.gitignore @@ -211,4 +211,5 @@ git_issues.md .docs/ .issues/ .gitboss/ + manage-collab.sh \ No newline at end of file From d418a04602ebe32d68d248a2995488beec768c61 Mon Sep 17 00:00:00 2001 From: Darwing Medina Date: Wed, 20 Nov 2024 04:52:11 -0600 Subject: [PATCH 055/115] Fix #260 prevent pass duplicated kwargs to scrapping_strategy (#269) Thank you for the suggestions. It totally makes sense now. Change to pop operator. --- crawl4ai/async_webcrawler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 38e429ca..fb8c5290 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -197,8 +197,8 @@ class AsyncWebCrawler: html, word_count_threshold=word_count_threshold, css_selector=css_selector, - only_text=kwargs.get("only_text", False), - image_description_min_word_threshold=kwargs.get( + only_text=kwargs.pop("only_text", False), + image_description_min_word_threshold=kwargs.pop( "image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD ), **kwargs, From 3439f7886d170e05e0c97c804b1057187325c2a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=A8=8B=E5=BA=8F=E5=91=98=E9=98=BF=E6=B1=9F=28Relakkes?= =?UTF-8?q?=29?= Date: Wed, 20 Nov 2024 20:30:25 +0800 Subject: [PATCH 056/115] fix: crawler strategy exception handling and fixes (#271) --- crawl4ai/crawler_strategy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/crawl4ai/crawler_strategy.py b/crawl4ai/crawler_strategy.py index ce802e49..898dcfa8 100644 --- a/crawl4ai/crawler_strategy.py +++ b/crawl4ai/crawler_strategy.py @@ -283,7 +283,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy): print(f"[LOG] ✅ Crawled {url} successfully!") return html - except InvalidArgumentException: + except InvalidArgumentException as e: if not hasattr(e, 'msg'): e.msg = sanitize_input_encode(str(e)) raise InvalidArgumentException(f"Failed to crawl {url}: {e.msg}") From dbb751c8f09f76ffce4046784c2cd2b0021de7d0 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 21 Nov 2024 18:21:43 +0800 Subject: [PATCH 057/115] In this commit, we introduce the new concept of MarkdownGenerationStrategy, which allows us to expand our future strategies to generate better markdown. Right now, we generate raw markdown as we were doing before. We have a new algorithm for fitting markdown based on BM25, and now we add the ability to refine markdown into a citation form. Our links will be extracted and replaced by a citation reference number, and then we will have reference sections at the very end; we add all the links with the descriptions. This format is more suitable for large language models. In case we don't need to pass links, we can reduce the size of the markdown significantly and also attach the list of references as a separate file to a large language model. This commit contains changes for this direction. 
--- crawl4ai/__init__.py | 1 + crawl4ai/async_crawler_strategy.py | 13 +- crawl4ai/async_database.3.73.py | 285 --------------- crawl4ai/async_webcrawler.3.73.py | 344 ------------------ crawl4ai/async_webcrawler.py | 9 +- ...rategy.py => content_scraping_strategy.py} | 229 ++++++------ crawl4ai/markdown_generation_strategy.py | 115 ++++++ crawl4ai/models.py | 13 +- crawl4ai/utils.py | 88 +++++ crawl4ai/web_crawler.py | 2 +- tests/async/test_content_scraper_strategy.py | 4 +- tests/async/test_markdown_genertor.py | 165 +++++++++ 12 files changed, 506 insertions(+), 762 deletions(-) delete mode 100644 crawl4ai/async_database.3.73.py delete mode 100644 crawl4ai/async_webcrawler.3.73.py rename crawl4ai/{content_scrapping_strategy.py => content_scraping_strategy.py} (84%) create mode 100644 crawl4ai/markdown_generation_strategy.py create mode 100644 tests/async/test_markdown_genertor.py diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py index ad9475b4..0ccf13d8 100644 --- a/crawl4ai/__init__.py +++ b/crawl4ai/__init__.py @@ -1,6 +1,7 @@ # __init__.py from .async_webcrawler import AsyncWebCrawler, CacheMode + from .models import CrawlResult from .__version__ import __version__ # __version__ = "0.3.73" diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index e7dc9c54..3f332eb0 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -229,6 +229,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): self.headless = kwargs.get("headless", True) self.browser_type = kwargs.get("browser_type", "chromium") self.headers = kwargs.get("headers", {}) + self.cookies = kwargs.get("cookies", []) self.sessions = {} self.session_ttl = 1800 self.js_code = js_code @@ -295,6 +296,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # Set up the default context if self.default_context: await self.default_context.set_extra_http_headers(self.headers) + if self.cookies: + await 
self.default_context.add_cookies(self.cookies) if self.accept_downloads: await self.default_context.set_default_timeout(60000) await self.default_context.set_default_navigation_timeout(60000) @@ -669,6 +672,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # downloads_path=self.downloads_path if self.accept_downloads else None ) await context.add_cookies([{"name": "cookiesEnabled", "value": "true", "url": url}]) + if self.cookies: + await context.add_cookies(self.cookies) await context.set_extra_http_headers(self.headers) page = await context.new_page() self.sessions[session_id] = (context, page, time.time()) @@ -684,6 +689,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): proxy={"server": self.proxy} if self.proxy else None, accept_downloads=self.accept_downloads, ) + if self.cookies: + await context.add_cookies(self.cookies) await context.set_extra_http_headers(self.headers) if kwargs.get("override_navigator", False) or kwargs.get("simulate_user", False) or kwargs.get("magic", False): @@ -828,7 +835,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): for js in js_code: await page.evaluate(js) - await page.wait_for_load_state('networkidle') + # await page.wait_for_timeout(100) + # Check for on execution event await self.execute_hook('on_execution_started', page) @@ -846,6 +854,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): await self.smart_wait(page, wait_for, timeout=kwargs.get("page_timeout", 60000)) except Exception as e: raise RuntimeError(f"Wait condition failed: {str(e)}") + + # if not wait_for and js_code: + # await page.wait_for_load_state('networkidle', timeout=5000) # Update image dimensions update_image_dimensions_js = """ diff --git a/crawl4ai/async_database.3.73.py b/crawl4ai/async_database.3.73.py deleted file mode 100644 index f86c7f1d..00000000 --- a/crawl4ai/async_database.3.73.py +++ /dev/null @@ -1,285 +0,0 @@ -import os -from pathlib import Path -import aiosqlite -import asyncio -from 
typing import Optional, Tuple, Dict -from contextlib import asynccontextmanager -import logging -import json # Added for serialization/deserialization -from .utils import ensure_content_dirs, generate_content_hash -import xxhash -import aiofiles -# Set up logging -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - -DB_PATH = os.path.join(Path.home(), ".crawl4ai") -os.makedirs(DB_PATH, exist_ok=True) -DB_PATH = os.path.join(DB_PATH, "crawl4ai.db") - -class AsyncDatabaseManager: - def __init__(self, pool_size: int = 10, max_retries: int = 3): - self.db_path = DB_PATH - self.content_paths = ensure_content_dirs(os.path.dirname(DB_PATH)) - self.pool_size = pool_size - self.max_retries = max_retries - self.connection_pool: Dict[int, aiosqlite.Connection] = {} - self.pool_lock = asyncio.Lock() - self.connection_semaphore = asyncio.Semaphore(pool_size) - - async def initialize(self): - """Initialize the database and connection pool""" - await self.ainit_db() - - async def cleanup(self): - """Cleanup connections when shutting down""" - async with self.pool_lock: - for conn in self.connection_pool.values(): - await conn.close() - self.connection_pool.clear() - - @asynccontextmanager - async def get_connection(self): - """Connection pool manager""" - async with self.connection_semaphore: - task_id = id(asyncio.current_task()) - try: - async with self.pool_lock: - if task_id not in self.connection_pool: - conn = await aiosqlite.connect( - self.db_path, - timeout=30.0 - ) - await conn.execute('PRAGMA journal_mode = WAL') - await conn.execute('PRAGMA busy_timeout = 5000') - self.connection_pool[task_id] = conn - - yield self.connection_pool[task_id] - - except Exception as e: - logger.error(f"Connection error: {e}") - raise - finally: - async with self.pool_lock: - if task_id in self.connection_pool: - await self.connection_pool[task_id].close() - del self.connection_pool[task_id] - - async def execute_with_retry(self, operation, *args): - """Execute 
database operations with retry logic""" - for attempt in range(self.max_retries): - try: - async with self.get_connection() as db: - result = await operation(db, *args) - await db.commit() - return result - except Exception as e: - if attempt == self.max_retries - 1: - logger.error(f"Operation failed after {self.max_retries} attempts: {e}") - raise - await asyncio.sleep(1 * (attempt + 1)) # Exponential backoff - - async def ainit_db(self): - """Initialize database schema""" - async def _init(db): - await db.execute(''' - CREATE TABLE IF NOT EXISTS crawled_data ( - url TEXT PRIMARY KEY, - html TEXT, - cleaned_html TEXT, - markdown TEXT, - extracted_content TEXT, - success BOOLEAN, - media TEXT DEFAULT "{}", - links TEXT DEFAULT "{}", - metadata TEXT DEFAULT "{}", - screenshot TEXT DEFAULT "", - response_headers TEXT DEFAULT "{}", - downloaded_files TEXT DEFAULT "{}" -- New column added - ) - ''') - - await self.execute_with_retry(_init) - await self.update_db_schema() - - async def update_db_schema(self): - """Update database schema if needed""" - async def _check_columns(db): - cursor = await db.execute("PRAGMA table_info(crawled_data)") - columns = await cursor.fetchall() - return [column[1] for column in columns] - - column_names = await self.execute_with_retry(_check_columns) - - # List of new columns to add - new_columns = ['media', 'links', 'metadata', 'screenshot', 'response_headers', 'downloaded_files'] - - for column in new_columns: - if column not in column_names: - await self.aalter_db_add_column(column) - - async def aalter_db_add_column(self, new_column: str): - """Add new column to the database""" - async def _alter(db): - if new_column == 'response_headers': - await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT "{{}}"') - else: - await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""') - logger.info(f"Added column '{new_column}' to the database.") - - await self.execute_with_retry(_alter) - - - 
async def aget_cached_url(self, url: str) -> Optional[Tuple[str, str, str, str, str, bool, str, str, str, str]]: - """Retrieve cached URL data""" - async def _get(db): - async with db.execute( - ''' - SELECT url, html, cleaned_html, markdown, - extracted_content, success, media, links, - metadata, screenshot, response_headers, - downloaded_files - FROM crawled_data WHERE url = ? - ''', - (url,) - ) as cursor: - row = await cursor.fetchone() - if row: - # Load content from files using stored hashes - html = await self._load_content(row[1], 'html') if row[1] else "" - cleaned = await self._load_content(row[2], 'cleaned') if row[2] else "" - markdown = await self._load_content(row[3], 'markdown') if row[3] else "" - extracted = await self._load_content(row[4], 'extracted') if row[4] else "" - screenshot = await self._load_content(row[9], 'screenshots') if row[9] else "" - - return ( - row[0], # url - html or "", # Return empty string if file not found - cleaned or "", - markdown or "", - extracted or "", - row[5], # success - json.loads(row[6] or '{}'), # media - json.loads(row[7] or '{}'), # links - json.loads(row[8] or '{}'), # metadata - screenshot or "", - json.loads(row[10] or '{}'), # response_headers - json.loads(row[11] or '[]') # downloaded_files - ) - return None - - try: - return await self.execute_with_retry(_get) - except Exception as e: - logger.error(f"Error retrieving cached URL: {e}") - return None - - async def acache_url(self, url: str, html: str, cleaned_html: str, - markdown: str, extracted_content: str, success: bool, - media: str = "{}", links: str = "{}", - metadata: str = "{}", screenshot: str = "", - response_headers: str = "{}", downloaded_files: str = "[]"): - """Cache URL data with content stored in filesystem""" - - # Store content files and get hashes - html_hash = await self._store_content(html, 'html') - cleaned_hash = await self._store_content(cleaned_html, 'cleaned') - markdown_hash = await self._store_content(markdown, 'markdown') - 
extracted_hash = await self._store_content(extracted_content, 'extracted') - screenshot_hash = await self._store_content(screenshot, 'screenshots') - - async def _cache(db): - await db.execute(''' - INSERT INTO crawled_data ( - url, html, cleaned_html, markdown, - extracted_content, success, media, links, metadata, - screenshot, response_headers, downloaded_files - ) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) - ON CONFLICT(url) DO UPDATE SET - html = excluded.html, - cleaned_html = excluded.cleaned_html, - markdown = excluded.markdown, - extracted_content = excluded.extracted_content, - success = excluded.success, - media = excluded.media, - links = excluded.links, - metadata = excluded.metadata, - screenshot = excluded.screenshot, - response_headers = excluded.response_headers, - downloaded_files = excluded.downloaded_files - ''', (url, html_hash, cleaned_hash, markdown_hash, extracted_hash, - success, media, links, metadata, screenshot_hash, - response_headers, downloaded_files)) - - try: - await self.execute_with_retry(_cache) - except Exception as e: - logger.error(f"Error caching URL: {e}") - - - - async def aget_total_count(self) -> int: - """Get total number of cached URLs""" - async def _count(db): - async with db.execute('SELECT COUNT(*) FROM crawled_data') as cursor: - result = await cursor.fetchone() - return result[0] if result else 0 - - try: - return await self.execute_with_retry(_count) - except Exception as e: - logger.error(f"Error getting total count: {e}") - return 0 - - async def aclear_db(self): - """Clear all data from the database""" - async def _clear(db): - await db.execute('DELETE FROM crawled_data') - - try: - await self.execute_with_retry(_clear) - except Exception as e: - logger.error(f"Error clearing database: {e}") - - async def aflush_db(self): - """Drop the entire table""" - async def _flush(db): - await db.execute('DROP TABLE IF EXISTS crawled_data') - - try: - await self.execute_with_retry(_flush) - except Exception as e: - 
logger.error(f"Error flushing database: {e}") - - - async def _store_content(self, content: str, content_type: str) -> str: - """Store content in filesystem and return hash""" - if not content: - return "" - - content_hash = generate_content_hash(content) - file_path = os.path.join(self.content_paths[content_type], content_hash) - - # Only write if file doesn't exist - if not os.path.exists(file_path): - async with aiofiles.open(file_path, 'w', encoding='utf-8') as f: - await f.write(content) - - return content_hash - - async def _load_content(self, content_hash: str, content_type: str) -> Optional[str]: - """Load content from filesystem by hash""" - if not content_hash: - return None - - file_path = os.path.join(self.content_paths[content_type], content_hash) - try: - async with aiofiles.open(file_path, 'r', encoding='utf-8') as f: - return await f.read() - except: - logger.error(f"Failed to load content: {file_path}") - return None - -# Create a singleton instance -async_db_manager = AsyncDatabaseManager() diff --git a/crawl4ai/async_webcrawler.3.73.py b/crawl4ai/async_webcrawler.3.73.py deleted file mode 100644 index 03e7a393..00000000 --- a/crawl4ai/async_webcrawler.3.73.py +++ /dev/null @@ -1,344 +0,0 @@ -import os -import time -from pathlib import Path -from typing import Optional -import json -import asyncio -from .models import CrawlResult -from .async_database import async_db_manager -from .chunking_strategy import * -from .extraction_strategy import * -from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse -from .content_scrapping_strategy import WebScrapingStrategy -from .config import MIN_WORD_THRESHOLD, IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD -from .utils import ( - sanitize_input_encode, - InvalidCSSSelectorError, - format_html -) -from .__version__ import __version__ as crawl4ai_version - -class AsyncWebCrawler: - def __init__( - self, - crawler_strategy: Optional[AsyncCrawlerStrategy] = None, - 
always_by_pass_cache: bool = False, - base_directory: str = str(Path.home()), - **kwargs, - ): - self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy( - **kwargs - ) - self.always_by_pass_cache = always_by_pass_cache - # self.crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai") - self.crawl4ai_folder = os.path.join(base_directory, ".crawl4ai") - os.makedirs(self.crawl4ai_folder, exist_ok=True) - os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True) - self.ready = False - self.verbose = kwargs.get("verbose", False) - - async def __aenter__(self): - await self.crawler_strategy.__aenter__() - await self.awarmup() - return self - - async def __aexit__(self, exc_type, exc_val, exc_tb): - await self.crawler_strategy.__aexit__(exc_type, exc_val, exc_tb) - - async def awarmup(self): - # Print a message for crawl4ai and its version - if self.verbose: - print(f"[LOG] 🚀 Crawl4AI {crawl4ai_version}") - print("[LOG] 🌤️ Warming up the AsyncWebCrawler") - # await async_db_manager.ainit_db() - # # await async_db_manager.initialize() - # await self.arun( - # url="https://google.com/", - # word_count_threshold=5, - # bypass_cache=False, - # verbose=False, - # ) - self.ready = True - if self.verbose: - print("[LOG] 🌞 AsyncWebCrawler is ready to crawl") - - async def arun( - self, - url: str, - word_count_threshold=MIN_WORD_THRESHOLD, - extraction_strategy: ExtractionStrategy = None, - chunking_strategy: ChunkingStrategy = RegexChunking(), - bypass_cache: bool = False, - css_selector: str = None, - screenshot: bool = False, - user_agent: str = None, - verbose=True, - disable_cache: bool = False, - no_cache_read: bool = False, - no_cache_write: bool = False, - **kwargs, - ) -> CrawlResult: - """ - Runs the crawler for a single source: URL (web, local file, or raw HTML). - - Args: - url (str): The URL to crawl. Supported prefixes: - - 'http://' or 'https://': Web URL to crawl. - - 'file://': Local file path to process. 
- - 'raw:': Raw HTML content to process. - ... [other existing parameters] - - Returns: - CrawlResult: The result of the crawling and processing. - """ - try: - if disable_cache: - bypass_cache = True - no_cache_read = True - no_cache_write = True - - extraction_strategy = extraction_strategy or NoExtractionStrategy() - extraction_strategy.verbose = verbose - if not isinstance(extraction_strategy, ExtractionStrategy): - raise ValueError("Unsupported extraction strategy") - if not isinstance(chunking_strategy, ChunkingStrategy): - raise ValueError("Unsupported chunking strategy") - - word_count_threshold = max(word_count_threshold, MIN_WORD_THRESHOLD) - - async_response: AsyncCrawlResponse = None - cached = None - screenshot_data = None - extracted_content = None - - is_web_url = url.startswith(('http://', 'https://')) - is_local_file = url.startswith("file://") - is_raw_html = url.startswith("raw:") - _url = url if not is_raw_html else "Raw HTML" - - start_time = time.perf_counter() - cached_result = None - if is_web_url and (not bypass_cache or not no_cache_read) and not self.always_by_pass_cache: - cached_result = await async_db_manager.aget_cached_url(url) - - if cached_result: - html = sanitize_input_encode(cached_result.html) - extracted_content = sanitize_input_encode(cached_result.extracted_content or "") - if screenshot: - screenshot_data = cached_result.screenshot - if not screenshot_data: - cached_result = None - if verbose: - print( - f"[LOG] 1️⃣ ✅ Page fetched (cache) for {_url}, success: {bool(html)}, time taken: {time.perf_counter() - start_time:.2f} seconds" - ) - - - if not cached or not html: - t1 = time.perf_counter() - - if user_agent: - self.crawler_strategy.update_user_agent(user_agent) - async_response: AsyncCrawlResponse = await self.crawler_strategy.crawl(url, screenshot=screenshot, **kwargs) - html = sanitize_input_encode(async_response.html) - screenshot_data = async_response.screenshot - t2 = time.perf_counter() - if verbose: - print( - 
f"[LOG] 1️⃣ ✅ Page fetched (no-cache) for {_url}, success: {bool(html)}, time taken: {t2 - t1:.2f} seconds" - ) - - t1 = time.perf_counter() - crawl_result = await self.aprocess_html( - url=url, - html=html, - extracted_content=extracted_content, - word_count_threshold=word_count_threshold, - extraction_strategy=extraction_strategy, - chunking_strategy=chunking_strategy, - css_selector=css_selector, - screenshot=screenshot_data, - verbose=verbose, - is_cached=bool(cached), - async_response=async_response, - bypass_cache=bypass_cache, - is_web_url = is_web_url, - is_local_file = is_local_file, - is_raw_html = is_raw_html, - **kwargs, - ) - - if async_response: - crawl_result.status_code = async_response.status_code - crawl_result.response_headers = async_response.response_headers - crawl_result.downloaded_files = async_response.downloaded_files - else: - crawl_result.status_code = 200 - crawl_result.response_headers = cached_result.response_headers if cached_result else {} - - crawl_result.success = bool(html) - crawl_result.session_id = kwargs.get("session_id", None) - - if verbose: - print( - f"[LOG] 🔥 🚀 Crawling done for {_url}, success: {crawl_result.success}, time taken: {time.perf_counter() - start_time:.2f} seconds" - ) - - if not is_raw_html and not no_cache_write: - if not bool(cached_result) or kwargs.get("bypass_cache", False) or self.always_by_pass_cache: - await async_db_manager.acache_url(crawl_result) - - - return crawl_result - - except Exception as e: - if not hasattr(e, "msg"): - e.msg = str(e) - print(f"[ERROR] 🚫 arun(): Failed to crawl {_url}, error: {e.msg}") - return CrawlResult(url=url, html="", markdown = f"[ERROR] 🚫 arun(): Failed to crawl {_url}, error: {e.msg}", success=False, error_message=e.msg) - - async def arun_many( - self, - urls: List[str], - word_count_threshold=MIN_WORD_THRESHOLD, - extraction_strategy: ExtractionStrategy = None, - chunking_strategy: ChunkingStrategy = RegexChunking(), - bypass_cache: bool = False, - 
css_selector: str = None, - screenshot: bool = False, - user_agent: str = None, - verbose=True, - **kwargs, - ) -> List[CrawlResult]: - """ - Runs the crawler for multiple sources: URLs (web, local files, or raw HTML). - - Args: - urls (List[str]): A list of URLs with supported prefixes: - - 'http://' or 'https://': Web URL to crawl. - - 'file://': Local file path to process. - - 'raw:': Raw HTML content to process. - ... [other existing parameters] - - Returns: - List[CrawlResult]: The results of the crawling and processing. - """ - semaphore_count = kwargs.get('semaphore_count', 5) # Adjust as needed - semaphore = asyncio.Semaphore(semaphore_count) - - async def crawl_with_semaphore(url): - async with semaphore: - return await self.arun( - url, - word_count_threshold=word_count_threshold, - extraction_strategy=extraction_strategy, - chunking_strategy=chunking_strategy, - bypass_cache=bypass_cache, - css_selector=css_selector, - screenshot=screenshot, - user_agent=user_agent, - verbose=verbose, - **kwargs, - ) - - tasks = [crawl_with_semaphore(url) for url in urls] - results = await asyncio.gather(*tasks, return_exceptions=True) - return [result if not isinstance(result, Exception) else str(result) for result in results] - - async def aprocess_html( - self, - url: str, - html: str, - extracted_content: str, - word_count_threshold: int, - extraction_strategy: ExtractionStrategy, - chunking_strategy: ChunkingStrategy, - css_selector: str, - screenshot: str, - verbose: bool, - **kwargs, - ) -> CrawlResult: - t = time.perf_counter() - # Extract content from HTML - try: - _url = url if not kwargs.get("is_raw_html", False) else "Raw HTML" - t1 = time.perf_counter() - scrapping_strategy = WebScrapingStrategy() - # result = await scrapping_strategy.ascrap( - result = scrapping_strategy.scrap( - url, - html, - word_count_threshold=word_count_threshold, - css_selector=css_selector, - only_text=kwargs.get("only_text", False), - 
image_description_min_word_threshold=kwargs.get( - "image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD - ), - **kwargs, - ) - - if result is None: - raise ValueError(f"Process HTML, Failed to extract content from the website: {url}") - except InvalidCSSSelectorError as e: - raise ValueError(str(e)) - except Exception as e: - raise ValueError(f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}") - - cleaned_html = sanitize_input_encode(result.get("cleaned_html", "")) - markdown = sanitize_input_encode(result.get("markdown", "")) - fit_markdown = sanitize_input_encode(result.get("fit_markdown", "")) - fit_html = sanitize_input_encode(result.get("fit_html", "")) - media = result.get("media", []) - links = result.get("links", []) - metadata = result.get("metadata", {}) - - if verbose: - print( - f"[LOG] 2️⃣ ✅ Scraping done for {_url}, success: True, time taken: {time.perf_counter() - t1:.2f} seconds" - ) - - if extracted_content is None and extraction_strategy and chunking_strategy and not isinstance(extraction_strategy, NoExtractionStrategy): - t1 = time.perf_counter() - # Check if extraction strategy is type of JsonCssExtractionStrategy - if isinstance(extraction_strategy, JsonCssExtractionStrategy) or isinstance(extraction_strategy, JsonCssExtractionStrategy): - extraction_strategy.verbose = verbose - extracted_content = extraction_strategy.run(url, [html]) - extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False) - else: - sections = chunking_strategy.chunk(markdown) - extracted_content = extraction_strategy.run(url, sections) - extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False) - if verbose: - print( - f"[LOG] 3️⃣ ✅ Extraction done for {_url}, time taken: {time.perf_counter() - t1:.2f} seconds" - ) - - screenshot = None if not screenshot else screenshot - - return CrawlResult( - url=url, - html=html, - 
cleaned_html=format_html(cleaned_html), - markdown=markdown, - fit_markdown=fit_markdown, - fit_html= fit_html, - media=media, - links=links, - metadata=metadata, - screenshot=screenshot, - extracted_content=extracted_content, - success=True, - error_message="", - ) - - async def aclear_cache(self): - # await async_db_manager.aclear_db() - await async_db_manager.cleanup() - - async def aflush_cache(self): - await async_db_manager.aflush_db() - - async def aget_cache_size(self): - return await async_db_manager.aget_total_count() - - diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 7d1814b6..2ff7ce0f 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -7,14 +7,14 @@ from pathlib import Path from typing import Optional, List, Union import json import asyncio -from .models import CrawlResult +from .models import CrawlResult, MarkdownGenerationResult from .async_database import async_db_manager from .chunking_strategy import * from .content_filter_strategy import * from .extraction_strategy import * from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode -from .content_scrapping_strategy import WebScrapingStrategy +from .content_scraping_strategy import WebScrapingStrategy from .async_logger import AsyncLogger from .config import ( @@ -476,7 +476,7 @@ class AsyncWebCrawler: html, word_count_threshold=word_count_threshold, css_selector=css_selector, - only_text=kwargs.get("only_text", False), + only_text=kwargs.pop("only_text", False), image_description_min_word_threshold=kwargs.get( "image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD ), @@ -491,6 +491,8 @@ class AsyncWebCrawler: except Exception as e: raise ValueError(f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}") + markdown_v2: MarkdownGenerationResult = 
result.get("markdown_v2", None) + cleaned_html = sanitize_input_encode(result.get("cleaned_html", "")) markdown = sanitize_input_encode(result.get("markdown", "")) fit_markdown = sanitize_input_encode(result.get("fit_markdown", "")) @@ -542,6 +544,7 @@ class AsyncWebCrawler: url=url, html=html, cleaned_html=format_html(cleaned_html), + markdown_v2=markdown_v2, markdown=markdown, fit_markdown=fit_markdown, fit_html= fit_html, diff --git a/crawl4ai/content_scrapping_strategy.py b/crawl4ai/content_scraping_strategy.py similarity index 84% rename from crawl4ai/content_scrapping_strategy.py rename to crawl4ai/content_scraping_strategy.py index 0f470671..3823a78d 100644 --- a/crawl4ai/content_scrapping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -1,6 +1,6 @@ import re # Point 1: Pre-Compile Regular Expressions from abc import ABC, abstractmethod -from typing import Dict, Any +from typing import Dict, Any, Optional from bs4 import BeautifulSoup from concurrent.futures import ThreadPoolExecutor import asyncio, requests, re, os @@ -10,103 +10,19 @@ from urllib.parse import urljoin from requests.exceptions import InvalidSchema # from .content_cleaning_strategy import ContentCleaningStrategy from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter - +from .markdown_generation_strategy import MarkdownGenerationStrategy, DefaultMarkdownGenerationStrategy +from .models import MarkdownGenerationResult from .utils import ( sanitize_input_encode, sanitize_html, extract_metadata, InvalidCSSSelectorError, - # CustomHTML2Text, + CustomHTML2Text, normalize_url, is_external_url ) -from .html2text import HTML2Text -class CustomHTML2Text(HTML2Text): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.inside_pre = False - self.inside_code = False - self.preserve_tags = set() # Set of tags to preserve - self.current_preserved_tag = None - self.preserved_content = [] - self.preserve_depth = 0 - - # Configuration options - 
self.skip_internal_links = False - self.single_line_break = False - self.mark_code = False - self.include_sup_sub = False - self.body_width = 0 - self.ignore_mailto_links = True - self.ignore_links = False - self.escape_backslash = False - self.escape_dot = False - self.escape_plus = False - self.escape_dash = False - self.escape_snob = False - - def update_params(self, **kwargs): - """Update parameters and set preserved tags.""" - for key, value in kwargs.items(): - if key == 'preserve_tags': - self.preserve_tags = set(value) - else: - setattr(self, key, value) - - def handle_tag(self, tag, attrs, start): - # Handle preserved tags - if tag in self.preserve_tags: - if start: - if self.preserve_depth == 0: - self.current_preserved_tag = tag - self.preserved_content = [] - # Format opening tag with attributes - attr_str = ''.join(f' {k}="{v}"' for k, v in attrs.items() if v is not None) - self.preserved_content.append(f'<{tag}{attr_str}>') - self.preserve_depth += 1 - return - else: - self.preserve_depth -= 1 - if self.preserve_depth == 0: - self.preserved_content.append(f'') - # Output the preserved HTML block with proper spacing - preserved_html = ''.join(self.preserved_content) - self.o('\n' + preserved_html + '\n') - self.current_preserved_tag = None - return - - # If we're inside a preserved tag, collect all content - if self.preserve_depth > 0: - if start: - # Format nested tags with attributes - attr_str = ''.join(f' {k}="{v}"' for k, v in attrs.items() if v is not None) - self.preserved_content.append(f'<{tag}{attr_str}>') - else: - self.preserved_content.append(f'') - return - - # Handle pre tags - if tag == 'pre': - if start: - self.o('```\n') - self.inside_pre = True - else: - self.o('\n```') - self.inside_pre = False - # elif tag in ["h1", "h2", "h3", "h4", "h5", "h6"]: - # pass - else: - super().handle_tag(tag, attrs, start) - - def handle_data(self, data, entity_char=False): - """Override handle_data to capture content within preserved tags.""" - if 
self.preserve_depth > 0: - self.preserved_content.append(data) - return - super().handle_data(data, entity_char) - # Pre-compile regular expressions for Open Graph and Twitter metadata OG_REGEX = re.compile(r'^og:') TWITTER_REGEX = re.compile(r'^twitter:') @@ -164,6 +80,98 @@ class WebScrapingStrategy(ContentScrapingStrategy): async def ascrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]: return await asyncio.to_thread(self._get_content_of_website_optimized, url, html, **kwargs) + + def _generate_markdown_content(self, + cleaned_html: str, + html: str, + url: str, + success: bool, + **kwargs) -> Dict[str, Any]: + """Generate markdown content using either new strategy or legacy method. + + Args: + cleaned_html: Sanitized HTML content + html: Original HTML content + url: Base URL of the page + success: Whether scraping was successful + **kwargs: Additional options including: + - markdown_generator: Optional[MarkdownGenerationStrategy] + - html2text: Dict[str, Any] options for HTML2Text + - content_filter: Optional[RelevantContentFilter] + - fit_markdown: bool + - fit_markdown_user_query: Optional[str] + - fit_markdown_bm25_threshold: float + + Returns: + Dict containing markdown content in various formats + """ + markdown_generator: Optional[MarkdownGenerationStrategy] = kwargs.get('markdown_generator', DefaultMarkdownGenerationStrategy()) + + if markdown_generator: + try: + markdown_result = markdown_generator.generate_markdown( + cleaned_html=cleaned_html, + base_url=url, + html2text_options=kwargs.get('html2text', {}), + content_filter=kwargs.get('content_filter', None) + ) + + markdown_v2 = MarkdownGenerationResult( + raw_markdown=markdown_result.raw_markdown, + markdown_with_citations=markdown_result.markdown_with_citations, + references_markdown=markdown_result.references_markdown, + fit_markdown=markdown_result.fit_markdown + ) + + return { + 'markdown': markdown_result.raw_markdown, + 'fit_markdown': markdown_result.fit_markdown or "Set flag 
'fit_markdown' to True to get cleaned HTML content.", + 'fit_html': kwargs.get('content_filter', None).filter_content(html) if kwargs.get('content_filter') else "Set flag 'fit_markdown' to True to get cleaned HTML content.", + 'markdown_v2': markdown_v2 + } + except Exception as e: + self._log('error', + message="Error using new markdown generation strategy: {error}", + tag="SCRAPE", + params={"error": str(e)} + ) + markdown_generator = None + + # Legacy method + h = CustomHTML2Text() + h.update_params(**kwargs.get('html2text', {})) + markdown = h.handle(cleaned_html) + markdown = markdown.replace(' ```', '```') + + fit_markdown = "Set flag 'fit_markdown' to True to get cleaned HTML content." + fit_html = "Set flag 'fit_markdown' to True to get cleaned HTML content." + + if kwargs.get('content_filter', None) or kwargs.get('fit_markdown', False): + content_filter = kwargs.get('content_filter', None) + if not content_filter: + content_filter = BM25ContentFilter( + user_query=kwargs.get('fit_markdown_user_query', None), + bm25_threshold=kwargs.get('fit_markdown_bm25_threshold', 1.0) + ) + fit_html = content_filter.filter_content(html) + fit_html = '\n'.join('
    {}
    '.format(s) for s in fit_html) + fit_markdown = h.handle(fit_html) + + markdown_v2 = MarkdownGenerationResult( + raw_markdown=markdown, + markdown_with_citations=markdown, + references_markdown=markdown, + fit_markdown=fit_markdown + ) + + return { + 'markdown': markdown, + 'fit_markdown': fit_markdown, + 'fit_html': fit_html, + 'markdown_v2' : markdown_v2 + } + + def _get_content_of_website_optimized(self, url: str, html: str, word_count_threshold: int = MIN_WORD_THRESHOLD, css_selector: str = None, **kwargs) -> Dict[str, Any]: success = True if not html: @@ -242,8 +250,6 @@ class WebScrapingStrategy(ContentScrapingStrategy): #Score an image for it's usefulness def score_image_for_usefulness(img, base_url, index, images_count): - - image_height = img.get('height') height_value, height_unit = parse_dimension(image_height) image_width = img.get('width') @@ -282,7 +288,7 @@ class WebScrapingStrategy(ContentScrapingStrategy): if not is_valid_image(img, img.parent, img.parent.get('class', [])): return None score = score_image_for_usefulness(img, url, index, total_images) - if score <= IMAGE_SCORE_THRESHOLD: + if score <= kwargs.get('image_score_threshold', IMAGE_SCORE_THRESHOLD): return None return { 'src': img.get('src', ''), @@ -545,41 +551,16 @@ class WebScrapingStrategy(ContentScrapingStrategy): cleaned_html = str_body.replace('\n\n', '\n').replace(' ', ' ') - try: - h = CustomHTML2Text() - h.update_params(**kwargs.get('html2text', {})) - markdown = h.handle(cleaned_html) - except Exception as e: - if not h: - h = CustomHTML2Text() - self._log('error', - message="Error converting HTML to markdown: {error}", - tag="SCRAPE", - params={"error": str(e)} - ) - markdown = h.handle(sanitize_html(cleaned_html)) - markdown = markdown.replace(' ```', '```') - + markdown_content = self._generate_markdown_content( + cleaned_html=cleaned_html, + html=html, + url=url, + success=success, + **kwargs + ) - - fit_markdown = "Set flag 'fit_markdown' to True to get cleaned HTML 
content." - fit_html = "Set flag 'fit_markdown' to True to get cleaned HTML content." - if kwargs.get('content_filter', None) or kwargs.get('fit_markdown', False): - content_filter = kwargs.get('content_filter', None) - if not content_filter: - content_filter = BM25ContentFilter( - user_query= kwargs.get('fit_markdown_user_query', None), - bm25_threshold= kwargs.get('fit_markdown_bm25_threshold', 1.0) - ) - fit_html = content_filter.filter_content(html) - fit_html = '\n'.join('
    {}
    '.format(s) for s in fit_html) - fit_markdown = h.handle(fit_html) - - cleaned_html = sanitize_html(cleaned_html) return { - 'markdown': markdown, - 'fit_markdown': fit_markdown, - 'fit_html': fit_html, + **markdown_content, 'cleaned_html': cleaned_html, 'success': success, 'media': media, diff --git a/crawl4ai/markdown_generation_strategy.py b/crawl4ai/markdown_generation_strategy.py new file mode 100644 index 00000000..1adb4c28 --- /dev/null +++ b/crawl4ai/markdown_generation_strategy.py @@ -0,0 +1,115 @@ +from abc import ABC, abstractmethod +from typing import Optional, Dict, Any, Tuple +from .models import MarkdownGenerationResult +from .utils import CustomHTML2Text +from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter +import re +from urllib.parse import urljoin + +# Pre-compile the regex pattern +LINK_PATTERN = re.compile(r'!?\[([^\]]+)\]\(([^)]+?)(?:\s+"([^"]*)")?\)') + +class MarkdownGenerationStrategy(ABC): + """Abstract base class for markdown generation strategies.""" + + @abstractmethod + def generate_markdown(self, + cleaned_html: str, + base_url: str = "", + html2text_options: Optional[Dict[str, Any]] = None, + content_filter: Optional[RelevantContentFilter] = None, + citations: bool = True, + **kwargs) -> MarkdownGenerationResult: + """Generate markdown from cleaned HTML.""" + pass + +class DefaultMarkdownGenerationStrategy(MarkdownGenerationStrategy): + """Default implementation of markdown generation strategy.""" + + def convert_links_to_citations(self, markdown: str, base_url: str = "") -> Tuple[str, str]: + link_map = {} + url_cache = {} # Cache for URL joins + parts = [] + last_end = 0 + counter = 1 + + for match in LINK_PATTERN.finditer(markdown): + parts.append(markdown[last_end:match.start()]) + text, url, title = match.groups() + + # Use cached URL if available, otherwise compute and cache + if base_url and not url.startswith(('http://', 'https://', 'mailto:')): + if url not in url_cache: + url_cache[url] = 
fast_urljoin(base_url, url) + url = url_cache[url] + + if url not in link_map: + desc = [] + if title: desc.append(title) + if text and text != title: desc.append(text) + link_map[url] = (counter, ": " + " - ".join(desc) if desc else "") + counter += 1 + + num = link_map[url][0] + parts.append(f"{text}⟨{num}⟩" if not match.group(0).startswith('!') else f"![{text}⟨{num}⟩]") + last_end = match.end() + + parts.append(markdown[last_end:]) + converted_text = ''.join(parts) + + # Pre-build reference strings + references = ["\n\n## References\n\n"] + references.extend( + f"⟨{num}⟩ {url}{desc}\n" + for url, (num, desc) in sorted(link_map.items(), key=lambda x: x[1][0]) + ) + + return converted_text, ''.join(references) + + def generate_markdown(self, + cleaned_html: str, + base_url: str = "", + html2text_options: Optional[Dict[str, Any]] = None, + content_filter: Optional[RelevantContentFilter] = None, + citations: bool = True, + **kwargs) -> MarkdownGenerationResult: + """Generate markdown with citations from cleaned HTML.""" + # Initialize HTML2Text with options + h = CustomHTML2Text() + if html2text_options: + h.update_params(**html2text_options) + + # Generate raw markdown + raw_markdown = h.handle(cleaned_html) + raw_markdown = raw_markdown.replace(' ```', '```') + + # Convert links to citations + if citations: + markdown_with_citations, references_markdown = self.convert_links_to_citations( + raw_markdown, base_url + ) + + # Generate fit markdown if content filter is provided + fit_markdown: Optional[str] = None + if content_filter: + filtered_html = content_filter.filter_content(cleaned_html) + filtered_html = '\n'.join('
    {}
    '.format(s) for s in filtered_html) + fit_markdown = h.handle(filtered_html) + + return MarkdownGenerationResult( + raw_markdown=raw_markdown, + markdown_with_citations=markdown_with_citations, + references_markdown=references_markdown, + fit_markdown=fit_markdown + ) + +def fast_urljoin(base: str, url: str) -> str: + """Fast URL joining for common cases.""" + if url.startswith(('http://', 'https://', 'mailto:', '//')): + return url + if url.startswith('/'): + # Handle absolute paths + if base.endswith('/'): + return base[:-1] + url + return base + url + return urljoin(base, url) \ No newline at end of file diff --git a/crawl4ai/models.py b/crawl4ai/models.py index cab4c45b..122434ad 100644 --- a/crawl4ai/models.py +++ b/crawl4ai/models.py @@ -1,5 +1,5 @@ from pydantic import BaseModel, HttpUrl -from typing import List, Dict, Optional, Callable, Awaitable +from typing import List, Dict, Optional, Callable, Awaitable, Union @@ -7,6 +7,12 @@ class UrlModel(BaseModel): url: HttpUrl forced: bool = False +class MarkdownGenerationResult(BaseModel): + raw_markdown: str + markdown_with_citations: str + references_markdown: str + fit_markdown: Optional[str] = None + class CrawlResult(BaseModel): url: str html: str @@ -16,7 +22,8 @@ class CrawlResult(BaseModel): links: Dict[str, List[Dict]] = {} downloaded_files: Optional[List[str]] = None screenshot: Optional[str] = None - markdown: Optional[str] = None + markdown: Optional[Union[str, MarkdownGenerationResult]] = None + markdown_v2: Optional[MarkdownGenerationResult] = None fit_markdown: Optional[str] = None fit_html: Optional[str] = None extracted_content: Optional[str] = None @@ -36,3 +43,5 @@ class AsyncCrawlResponse(BaseModel): class Config: arbitrary_types_allowed = True + + diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index 9abc5784..b07562df 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -18,6 +18,94 @@ import hashlib from typing import Optional, Tuple, Dict, Any import xxhash + +from .html2text 
import HTML2Text +class CustomHTML2Text(HTML2Text): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.inside_pre = False + self.inside_code = False + self.preserve_tags = set() # Set of tags to preserve + self.current_preserved_tag = None + self.preserved_content = [] + self.preserve_depth = 0 + + # Configuration options + self.skip_internal_links = False + self.single_line_break = False + self.mark_code = False + self.include_sup_sub = False + self.body_width = 0 + self.ignore_mailto_links = True + self.ignore_links = False + self.escape_backslash = False + self.escape_dot = False + self.escape_plus = False + self.escape_dash = False + self.escape_snob = False + + def update_params(self, **kwargs): + """Update parameters and set preserved tags.""" + for key, value in kwargs.items(): + if key == 'preserve_tags': + self.preserve_tags = set(value) + else: + setattr(self, key, value) + + def handle_tag(self, tag, attrs, start): + # Handle preserved tags + if tag in self.preserve_tags: + if start: + if self.preserve_depth == 0: + self.current_preserved_tag = tag + self.preserved_content = [] + # Format opening tag with attributes + attr_str = ''.join(f' {k}="{v}"' for k, v in attrs.items() if v is not None) + self.preserved_content.append(f'<{tag}{attr_str}>') + self.preserve_depth += 1 + return + else: + self.preserve_depth -= 1 + if self.preserve_depth == 0: + self.preserved_content.append(f'') + # Output the preserved HTML block with proper spacing + preserved_html = ''.join(self.preserved_content) + self.o('\n' + preserved_html + '\n') + self.current_preserved_tag = None + return + + # If we're inside a preserved tag, collect all content + if self.preserve_depth > 0: + if start: + # Format nested tags with attributes + attr_str = ''.join(f' {k}="{v}"' for k, v in attrs.items() if v is not None) + self.preserved_content.append(f'<{tag}{attr_str}>') + else: + self.preserved_content.append(f'') + return + + # Handle pre tags + if tag == 
'pre': + if start: + self.o('```\n') + self.inside_pre = True + else: + self.o('\n```') + self.inside_pre = False + # elif tag in ["h1", "h2", "h3", "h4", "h5", "h6"]: + # pass + else: + super().handle_tag(tag, attrs, start) + + def handle_data(self, data, entity_char=False): + """Override handle_data to capture content within preserved tags.""" + if self.preserve_depth > 0: + self.preserved_content.append(data) + return + super().handle_data(data, entity_char) + + + class InvalidCSSSelectorError(Exception): pass diff --git a/crawl4ai/web_crawler.py b/crawl4ai/web_crawler.py index 6cfef6f0..a32a988d 100644 --- a/crawl4ai/web_crawler.py +++ b/crawl4ai/web_crawler.py @@ -10,7 +10,7 @@ from .extraction_strategy import * from .crawler_strategy import * from typing import List from concurrent.futures import ThreadPoolExecutor -from .content_scrapping_strategy import WebScrapingStrategy +from .content_scraping_strategy import WebScrapingStrategy from .config import * import warnings import json diff --git a/tests/async/test_content_scraper_strategy.py b/tests/async/test_content_scraper_strategy.py index 5dfa6362..62c49148 100644 --- a/tests/async/test_content_scraper_strategy.py +++ b/tests/async/test_content_scraper_strategy.py @@ -13,8 +13,8 @@ parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__f sys.path.append(parent_dir) __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) -from crawl4ai.content_scrapping_strategy import WebScrapingStrategy -from crawl4ai.content_scrapping_strategy import WebScrapingStrategy as WebScrapingStrategyCurrent +from crawl4ai.content_scraping_strategy import WebScrapingStrategy +from crawl4ai.content_scraping_strategy import WebScrapingStrategy as WebScrapingStrategyCurrent # from crawl4ai.content_scrapping_strategy_current import WebScrapingStrategy as WebScrapingStrategyCurrent @dataclass diff --git a/tests/async/test_markdown_genertor.py b/tests/async/test_markdown_genertor.py 
new file mode 100644 index 00000000..025a0318 --- /dev/null +++ b/tests/async/test_markdown_genertor.py @@ -0,0 +1,165 @@ +# ## Issue #236 +# - **Last Updated:** 2024-11-11 01:42:14 +# - **Title:** [user data crawling opens two windows, unable to control correct user browser](https://github.com/unclecode/crawl4ai/issues/236) +# - **State:** open + +import os, sys, time +parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +sys.path.append(parent_dir) +__location__ = os.path.realpath( os.path.join(os.getcwd(), os.path.dirname(__file__))) +import asyncio +import os +import time +from typing import Dict, Any +from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerationStrategy + +# Get current directory +__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) + +def print_test_result(name: str, result: Dict[str, Any], execution_time: float): + """Helper function to print test results.""" + print(f"\n{'='*20} {name} {'='*20}") + print(f"Execution time: {execution_time:.4f} seconds") + + + # Save markdown to files + for key, content in result.items(): + if isinstance(content, str): + with open(__location__ + f"/output/{name.lower()}_{key}.md", "w") as f: + f.write(content) + + # # Print first few lines of each markdown version + # for key, content in result.items(): + # if isinstance(content, str): + # preview = '\n'.join(content.split('\n')[:3]) + # print(f"\n{key} (first 3 lines):") + # print(preview) + # print(f"Total length: {len(content)} characters") + +def test_basic_markdown_conversion(): + """Test basic markdown conversion with links.""" + with open(__location__ + "/data/wikipedia.html", "r") as f: + cleaned_html = f.read() + + generator = DefaultMarkdownGenerationStrategy() + + start_time = time.perf_counter() + result = generator.generate_markdown( + cleaned_html=cleaned_html, + base_url="https://en.wikipedia.org" + ) + execution_time = time.perf_counter() - start_time + + 
print_test_result("Basic Markdown Conversion", { + 'raw': result.raw_markdown, + 'with_citations': result.markdown_with_citations, + 'references': result.references_markdown + }, execution_time) + + # Basic assertions + assert result.raw_markdown, "Raw markdown should not be empty" + assert result.markdown_with_citations, "Markdown with citations should not be empty" + assert result.references_markdown, "References should not be empty" + assert "⟨" in result.markdown_with_citations, "Citations should use ⟨⟩ brackets" + assert "## References" in result.references_markdown, "Should contain references section" + +def test_relative_links(): + """Test handling of relative links with base URL.""" + markdown = """ + Here's a [relative link](/wiki/Apple) and an [absolute link](https://example.com). + Also an [image](/images/test.png) and another [page](/wiki/Banana). + """ + + generator = DefaultMarkdownGenerationStrategy() + result = generator.generate_markdown( + cleaned_html=markdown, + base_url="https://en.wikipedia.org" + ) + + assert "https://en.wikipedia.org/wiki/Apple" in result.references_markdown + assert "https://example.com" in result.references_markdown + assert "https://en.wikipedia.org/images/test.png" in result.references_markdown + +def test_duplicate_links(): + """Test handling of duplicate links.""" + markdown = """ + Here's a [link](/test) and another [link](/test) and a [different link](/other). + """ + + generator = DefaultMarkdownGenerationStrategy() + result = generator.generate_markdown( + cleaned_html=markdown, + base_url="https://example.com" + ) + + # Count citations in markdown + citations = result.markdown_with_citations.count("⟨1⟩") + assert citations == 2, "Same link should use same citation number" + +def test_link_descriptions(): + """Test handling of link titles and descriptions.""" + markdown = """ + Here's a [link with title](/test "Test Title") and a [link with description](/other) to test. 
+ """ + + generator = DefaultMarkdownGenerationStrategy() + result = generator.generate_markdown( + cleaned_html=markdown, + base_url="https://example.com" + ) + + assert "Test Title" in result.references_markdown, "Link title should be in references" + assert "link with description" in result.references_markdown, "Link text should be in references" + +def test_performance_large_document(): + """Test performance with large document.""" + with open(__location__ + "/data/wikipedia.md", "r") as f: + markdown = f.read() + + # Test with multiple iterations + iterations = 5 + times = [] + + generator = DefaultMarkdownGenerationStrategy() + + for i in range(iterations): + start_time = time.perf_counter() + result = generator.generate_markdown( + cleaned_html=markdown, + base_url="https://en.wikipedia.org" + ) + end_time = time.perf_counter() + times.append(end_time - start_time) + + avg_time = sum(times) / len(times) + print(f"\n{'='*20} Performance Test {'='*20}") + print(f"Average execution time over {iterations} iterations: {avg_time:.4f} seconds") + print(f"Min time: {min(times):.4f} seconds") + print(f"Max time: {max(times):.4f} seconds") + +def test_image_links(): + """Test handling of image links.""" + markdown = """ + Here's an ![image](/image.png "Image Title") and another ![image](/other.jpg). + And a regular [link](/page). 
+ """ + + generator = DefaultMarkdownGenerationStrategy() + result = generator.generate_markdown( + cleaned_html=markdown, + base_url="https://example.com" + ) + + assert "![" in result.markdown_with_citations, "Image markdown syntax should be preserved" + assert "Image Title" in result.references_markdown, "Image title should be in references" + +if __name__ == "__main__": + print("Running markdown generation strategy tests...") + + test_basic_markdown_conversion() + test_relative_links() + test_duplicate_links() + test_link_descriptions() + test_performance_large_document() + test_image_links() + \ No newline at end of file From 006bee4a5a50fed10496b701ecfea350be1b7888 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 22 Nov 2024 16:00:17 +0800 Subject: [PATCH 058/115] feat: enhance image processing capabilities - Enhanced image processing with srcset support and validation checks for better image selection. --- README.md | 4 +- crawl4ai/content_scraping_strategy.py | 145 ++++++++++++++++++++++++-- crawl4ai/tools.py | 34 ++++++ 3 files changed, 172 insertions(+), 11 deletions(-) create mode 100644 crawl4ai/tools.py diff --git a/README.md b/README.md index af0d6610..1d3063c7 100644 --- a/README.md +++ b/README.md @@ -13,8 +13,10 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it acc ## New in 0.3.74 ✨ -- 🚀 **Blazing Fast Scraping:** The scraping process is now significantly faster, often completing in under 100 milliseconds (excluding web fetch time)! +- 🚀 **Blazing Fast Scraping:** The scraping process is now significantly faster! - 📥 **Download Manager:** Integrated file crawling and downloading capabilities, with full control over file management and tracking within the `CrawlResult` object. +- 📝 **Markdown Generation Strategy:** Flexible markdown generation system supporting custom strategies for different use cases and output formats. 
+- 🔗 **LLM-Friendly Citations:** Automatic conversion of links into numbered citations with organized reference lists, making content more digestible for large language models. - 🔎 **Markdown Filter:** Enhanced content extraction using BM25 algorithm to create cleaner markdown with only relevant webpage content. - 🗂️ **Local & Raw HTML:** Crawl local files (`file://`) and raw HTML strings (`raw:`) directly. - 🤖 **Browser Control:** Use your own browser setup for crawling, with persistent contexts and stealth integration to bypass anti-bot measures. diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index 3823a78d..3b41ec82 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -19,9 +19,9 @@ from .utils import ( InvalidCSSSelectorError, CustomHTML2Text, normalize_url, - is_external_url - + is_external_url ) +from .tools import profile_and_time # Pre-compile regular expressions for Open Graph and Twitter metadata OG_REGEX = re.compile(r'^og:') @@ -234,7 +234,26 @@ class WebScrapingStrategy(ContentScrapingStrategy): return text_content return None - def process_image(img, url, index, total_images): + def process_image_old(img, url, index, total_images): + def parse_srcset(srcset_str): + """Parse srcset attribute into list of image URLs with their sizes.""" + if not srcset_str: + return [] + + sources = [] + # Split on http/https and filter empty strings + urls = [f"http{part}" for part in srcset_str.split("http") if part] + + for url in urls: + # Remove trailing comma and whitespace, then split to get width + url = url.strip().rstrip(',') + parts = url.rsplit(' ', 1) + img_url = parts[0].strip() + width = parts[1].rstrip('w') if len(parts) > 1 else None + sources.append({'url': img_url, 'width': width}) + + return sources + #Check if an image has valid display and inside undesired html elements def is_valid_image(img, parent, parent_classes): style = img.get('style', '') @@ -283,14 
+302,14 @@ class WebScrapingStrategy(ContentScrapingStrategy): score+=1 return score - - if not is_valid_image(img, img.parent, img.parent.get('class', [])): return None + score = score_image_for_usefulness(img, url, index, total_images) if score <= kwargs.get('image_score_threshold', IMAGE_SCORE_THRESHOLD): return None - return { + + base_result = { 'src': img.get('src', ''), 'data-src': img.get('data-src', ''), 'alt': img.get('alt', ''), @@ -299,6 +318,109 @@ class WebScrapingStrategy(ContentScrapingStrategy): 'type': 'image' } + sources = [] + srcset = img.get('srcset', '') + if srcset: + sources = parse_srcset(srcset) + if sources: + return [dict(base_result, src=source['url'], width=source['width']) + for source in sources] + + return [base_result] # Always return a list + + def process_image(img, url, index, total_images): + parse_srcset = lambda s: [{'url': u.strip().split()[0], 'width': u.strip().split()[-1].rstrip('w') + if ' ' in u else None} + for u in [f"http{p}" for p in s.split("http") if p]] + + # Constants for checks + classes_to_check = frozenset(['button', 'icon', 'logo']) + tags_to_check = frozenset(['button', 'input']) + + # Pre-fetch commonly used attributes + style = img.get('style', '') + alt = img.get('alt', '') + src = img.get('src', '') + data_src = img.get('data-src', '') + width = img.get('width') + height = img.get('height') + parent = img.parent + parent_classes = parent.get('class', []) + + # Quick validation checks + if ('display:none' in style or + parent.name in tags_to_check or + any(c in cls for c in parent_classes for cls in classes_to_check) or + any(c in src for c in classes_to_check) or + any(c in alt for c in classes_to_check)): + return None + + # Quick score calculation + score = 0 + if width and width.isdigit(): + width_val = int(width) + score += 1 if width_val > 150 else 0 + if height and height.isdigit(): + height_val = int(height) + score += 1 if height_val > 150 else 0 + if alt: + score += 1 + score += 
index/total_images < 0.5 + + image_format = '' + if "data:image/" in src: + image_format = src.split(',')[0].split(';')[0].split('/')[1].split(';')[0] + else: + image_format = os.path.splitext(src)[1].lower().strip('.').split('?')[0] + + if image_format in ('jpg', 'png', 'webp', 'avif'): + score += 1 + + if score <= kwargs.get('image_score_threshold', IMAGE_SCORE_THRESHOLD): + return None + + # Use set for deduplication + unique_urls = set() + image_variants = [] + + # Base image info template + base_info = { + 'alt': alt, + 'desc': find_closest_parent_with_useful_text(img), + 'score': score, + 'type': 'image' + } + + # Inline function for adding variants + def add_variant(src, width=None): + if src and not src.startswith('data:') and src not in unique_urls: + unique_urls.add(src) + image_variants.append({**base_info, 'src': src, 'width': width}) + + # Process all sources + add_variant(src) + add_variant(data_src) + + # Handle srcset and data-srcset in one pass + for attr in ('srcset', 'data-srcset'): + if value := img.get(attr): + for source in parse_srcset(value): + add_variant(source['url'], source['width']) + + # Quick picture element check + if picture := img.find_parent('picture'): + for source in picture.find_all('source'): + if srcset := source.get('srcset'): + for src in parse_srcset(srcset): + add_variant(src['url'], src['width']) + + # Framework-specific attributes in one pass + for attr, value in img.attrs.items(): + if attr.startswith('data-') and ('src' in attr or 'srcset' in attr) and 'http' in value: + add_variant(value) + + return image_variants if image_variants else None + def remove_unwanted_attributes(element, important_attrs, keep_data_attributes=False): attrs_to_remove = [] for attr in element.attrs: @@ -490,13 +612,16 @@ class WebScrapingStrategy(ContentScrapingStrategy): links['internal'] = list(internal_links_dict.values()) links['external'] = list(external_links_dict.values()) - # # Process images using ThreadPoolExecutor imgs = 
body.find_all('img') - with ThreadPoolExecutor() as executor: - image_results = list(executor.map(process_image, imgs, [url]*len(imgs), range(len(imgs)), [len(imgs)]*len(imgs))) - media['images'] = [result for result in image_results if result is not None] + # For test we use for loop instead of thread + media['images'] = [ + img for result in (process_image(img, url, i, len(imgs)) + for i, img in enumerate(imgs)) + if result is not None + for img in result + ] def flatten_nested_elements(node): if isinstance(node, NavigableString): diff --git a/crawl4ai/tools.py b/crawl4ai/tools.py new file mode 100644 index 00000000..ff36b53a --- /dev/null +++ b/crawl4ai/tools.py @@ -0,0 +1,34 @@ +import time +import cProfile +import pstats +from functools import wraps + +def profile_and_time(func): + @wraps(func) + def wrapper(self, *args, **kwargs): + # Start timer + start_time = time.perf_counter() + + # Setup profiler + profiler = cProfile.Profile() + profiler.enable() + + # Run function + result = func(self, *args, **kwargs) + + # Stop profiler + profiler.disable() + + # Calculate elapsed time + elapsed_time = time.perf_counter() - start_time + + # Print timing + print(f"[PROFILER] Scraping completed in {elapsed_time:.2f} seconds") + + # Print profiling stats + stats = pstats.Stats(profiler) + stats.sort_stats('cumulative') # Sort by cumulative time + stats.print_stats(20) # Print top 20 time-consuming functions + + return result + return wrapper \ No newline at end of file From 571dda6549da3c31a5f7566359585eefe9ad2867 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 22 Nov 2024 18:27:43 +0800 Subject: [PATCH 059/115] Update Redme --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 1d3063c7..e3ced79e 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,7 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it acc - 📝 **Markdown Generation Strategy:** Flexible markdown generation system supporting custom 
strategies for different use cases and output formats. - 🔗 **LLM-Friendly Citations:** Automatic conversion of links into numbered citations with organized reference lists, making content more digestible for large language models. - 🔎 **Markdown Filter:** Enhanced content extraction using BM25 algorithm to create cleaner markdown with only relevant webpage content. +- 🖼️ **Enhanced Image Extraction:** Supports srcset, picture elements, and responsive images. - 🗂️ **Local & Raw HTML:** Crawl local files (`file://`) and raw HTML strings (`raw:`) directly. - 🤖 **Browser Control:** Use your own browser setup for crawling, with persistent contexts and stealth integration to bypass anti-bot measures. - ☁️ **API & Cache Boost:** CORS support, static file serving, and a new filesystem-based cache for blazing-fast performance. Fine-tune caching with the `CacheMode` enum (ENABLED, DISABLED, READ_ONLY, WRITE_ONLY, BYPASS) and the `always_bypass_cache` parameter. From 24ad2fe2ddc11250bdd90d42c127a85cbfdb8fd5 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 22 Nov 2024 18:47:17 +0800 Subject: [PATCH 060/115] feat: enhance Markdown generation to include fit_html attribute --- crawl4ai/content_scraping_strategy.py | 13 +++---------- crawl4ai/markdown_generation_strategy.py | 3 ++- crawl4ai/models.py | 1 + 3 files changed, 6 insertions(+), 11 deletions(-) diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index 3b41ec82..d4b901d2 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -109,25 +109,18 @@ class WebScrapingStrategy(ContentScrapingStrategy): if markdown_generator: try: - markdown_result = markdown_generator.generate_markdown( + markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown( cleaned_html=cleaned_html, base_url=url, html2text_options=kwargs.get('html2text', {}), content_filter=kwargs.get('content_filter', None) ) - markdown_v2 = 
MarkdownGenerationResult( - raw_markdown=markdown_result.raw_markdown, - markdown_with_citations=markdown_result.markdown_with_citations, - references_markdown=markdown_result.references_markdown, - fit_markdown=markdown_result.fit_markdown - ) - return { 'markdown': markdown_result.raw_markdown, 'fit_markdown': markdown_result.fit_markdown or "Set flag 'fit_markdown' to True to get cleaned HTML content.", - 'fit_html': kwargs.get('content_filter', None).filter_content(html) if kwargs.get('content_filter') else "Set flag 'fit_markdown' to True to get cleaned HTML content.", - 'markdown_v2': markdown_v2 + 'fit_html': markdown_result.fit_html or "Set flag 'fit_markdown' to True to get cleaned HTML content.", + 'markdown_v2': markdown_result } except Exception as e: self._log('error', diff --git a/crawl4ai/markdown_generation_strategy.py b/crawl4ai/markdown_generation_strategy.py index 1adb4c28..7922c413 100644 --- a/crawl4ai/markdown_generation_strategy.py +++ b/crawl4ai/markdown_generation_strategy.py @@ -100,7 +100,8 @@ class DefaultMarkdownGenerationStrategy(MarkdownGenerationStrategy): raw_markdown=raw_markdown, markdown_with_citations=markdown_with_citations, references_markdown=references_markdown, - fit_markdown=fit_markdown + fit_markdown=fit_markdown, + fit_html=filtered_html ) def fast_urljoin(base: str, url: str) -> str: diff --git a/crawl4ai/models.py b/crawl4ai/models.py index 122434ad..3a1b8bd1 100644 --- a/crawl4ai/models.py +++ b/crawl4ai/models.py @@ -12,6 +12,7 @@ class MarkdownGenerationResult(BaseModel): markdown_with_citations: str references_markdown: str fit_markdown: Optional[str] = None + fit_html: Optional[str] = None class CrawlResult(BaseModel): url: str From e02935dc5b1fee1734f12fb60145193c2b9f5645 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 22 Nov 2024 18:49:22 +0800 Subject: [PATCH 061/115] chore: update README to reflect new features and improvements in version 0.3.74 --- README.md | 24 ++++++++++++------------ 1 file changed, 
12 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index e3ced79e..b0f9fff9 100644 --- a/README.md +++ b/README.md @@ -13,18 +13,18 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it acc ## New in 0.3.74 ✨ -- 🚀 **Blazing Fast Scraping:** The scraping process is now significantly faster! -- 📥 **Download Manager:** Integrated file crawling and downloading capabilities, with full control over file management and tracking within the `CrawlResult` object. -- 📝 **Markdown Generation Strategy:** Flexible markdown generation system supporting custom strategies for different use cases and output formats. -- 🔗 **LLM-Friendly Citations:** Automatic conversion of links into numbered citations with organized reference lists, making content more digestible for large language models. -- 🔎 **Markdown Filter:** Enhanced content extraction using BM25 algorithm to create cleaner markdown with only relevant webpage content. -- 🖼️ **Enhanced Image Extraction:** Supports srcset, picture elements, and responsive images. -- 🗂️ **Local & Raw HTML:** Crawl local files (`file://`) and raw HTML strings (`raw:`) directly. -- 🤖 **Browser Control:** Use your own browser setup for crawling, with persistent contexts and stealth integration to bypass anti-bot measures. -- ☁️ **API & Cache Boost:** CORS support, static file serving, and a new filesystem-based cache for blazing-fast performance. Fine-tune caching with the `CacheMode` enum (ENABLED, DISABLED, READ_ONLY, WRITE_ONLY, BYPASS) and the `always_bypass_cache` parameter. -- 🐳 **API Gateway:** Run Crawl4AI as a local or cloud API service, enabling cross-platform usage through a containerized server with secure token authentication via `CRAWL4AI_API_TOKEN`. -- 🛠️ **Database Improvements:** Enhanced database system for handling larger content sets with improved caching and faster performance. 
-- 🐛 **Squashed Bugs:** Fixed browser context issues in Docker, memory leaks, enhanced error handling, and improved HTML parsing. +🚀 **Blazing Fast Scraping**: Significantly improved scraping speed. +📥 **Download Manager**: Integrated file crawling, downloading, and tracking within `CrawlResult`. +📝 **Markdown Strategy**: Flexible system for custom markdown generation and formats. +🔗 **LLM-Friendly Citations**: Auto-converts links to numbered citations with reference lists. +🔎 **Markdown Filter**: BM25-based content extraction for cleaner, relevant markdown. +🖼️ **Image Extraction**: Supports `srcset`, `picture`, and responsive image formats. +🗂️ **Local/Raw HTML**: Crawl `file://` paths and raw HTML (`raw:`) directly. +🤖 **Browser Control**: Custom browser setups with stealth integration to bypass bots. +☁️ **API & Cache Boost**: CORS, static serving, and enhanced filesystem-based caching. +🐳 **API Gateway**: Run as an API service with secure token authentication. +🛠️ **Database Upgrades**: Optimized for larger content sets with faster caching. +🐛 **Bug Fixes**: Resolved browser context issues, memory leaks, and improved error handling. ## Try it Now! From 8dea3f470f5a496a30dada1eab1c3b23ee3560ca Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 22 Nov 2024 18:50:12 +0800 Subject: [PATCH 062/115] chore: update README to include new features and improvements for version 0.3.74 --- README.md | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index b0f9fff9..fa88a507 100644 --- a/README.md +++ b/README.md @@ -13,18 +13,19 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it acc ## New in 0.3.74 ✨ -🚀 **Blazing Fast Scraping**: Significantly improved scraping speed. -📥 **Download Manager**: Integrated file crawling, downloading, and tracking within `CrawlResult`. -📝 **Markdown Strategy**: Flexible system for custom markdown generation and formats. 
-🔗 **LLM-Friendly Citations**: Auto-converts links to numbered citations with reference lists. -🔎 **Markdown Filter**: BM25-based content extraction for cleaner, relevant markdown. -🖼️ **Image Extraction**: Supports `srcset`, `picture`, and responsive image formats. -🗂️ **Local/Raw HTML**: Crawl `file://` paths and raw HTML (`raw:`) directly. -🤖 **Browser Control**: Custom browser setups with stealth integration to bypass bots. -☁️ **API & Cache Boost**: CORS, static serving, and enhanced filesystem-based caching. -🐳 **API Gateway**: Run as an API service with secure token authentication. -🛠️ **Database Upgrades**: Optimized for larger content sets with faster caching. -🐛 **Bug Fixes**: Resolved browser context issues, memory leaks, and improved error handling. +- 🚀 **Blazing Fast Scraping**: Significantly improved scraping speed. +- 📥 **Download Manager**: Integrated file crawling, downloading, and tracking within `CrawlResult`. +- 📝 **Markdown Strategy**: Flexible system for custom markdown generation and formats. +- 🔗 **LLM-Friendly Citations**: Auto-converts links to numbered citations with reference lists. +- 🔎 **Markdown Filter**: BM25-based content extraction for cleaner, relevant markdown. +- 🖼️ **Image Extraction**: Supports `srcset`, `picture`, and responsive image formats. +- 🗂️ **Local/Raw HTML**: Crawl `file://` paths and raw HTML (`raw:`) directly. +- 🤖 **Browser Control**: Custom browser setups with stealth integration to bypass bots. +- ☁️ **API & Cache Boost**: CORS, static serving, and enhanced filesystem-based caching. +- 🐳 **API Gateway**: Run as an API service with secure token authentication. +- 🛠️ **Database Upgrades**: Optimized for larger content sets with faster caching. +- 🐛 **Bug Fixes**: Resolved browser context issues, memory leaks, and improved error handling. + ## Try it Now! 
From 0d0cef343842af2aa369423790e159620e717f6c Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 22 Nov 2024 20:14:58 +0800 Subject: [PATCH 063/115] feat: add enhanced markdown generation example with citations and file output --- docs/examples/v0.3.74.overview.py | 109 ++++++++++++++++++++---------- 1 file changed, 74 insertions(+), 35 deletions(-) diff --git a/docs/examples/v0.3.74.overview.py b/docs/examples/v0.3.74.overview.py index 00296740..362ae8fc 100644 --- a/docs/examples/v0.3.74.overview.py +++ b/docs/examples/v0.3.74.overview.py @@ -52,34 +52,7 @@ async def download_example(): else: print("\nNo files were downloaded") -# 2. Content Filtering with BM25 Example -async def content_filtering_example(): - """Example of using the new BM25 content filtering""" - async with AsyncWebCrawler(verbose=True) as crawler: - # Create filter with custom query for OpenAI's blog - content_filter = BM25ContentFilter( - # user_query="Investment and fundraising", - # user_query="Robotic", - bm25_threshold=1.0 - ) - - result = await crawler.arun( - url="https://techcrunch.com/", - content_filter=content_filter, - cache_mode=CacheMode.BYPASS - ) - - print(f"Filtered content: {len(result.fit_markdown)}") - print(f"Filtered content: {result.fit_markdown}") - - # Save html - with open(os.path.join(__data__, "techcrunch.html"), "w") as f: - f.write(result.fit_html) - - with open(os.path.join(__data__, "filtered_content.md"), "w") as f: - f.write(result.fit_markdown) - -# 3. Local File and Raw HTML Processing Example +# 2. Local File and Raw HTML Processing Example async def local_and_raw_html_example(): """Example of processing local files and raw HTML""" # Create a sample HTML file @@ -115,6 +88,68 @@ async def local_and_raw_html_example(): print("Local file content:", local_result.markdown) print("\nRaw HTML content:", raw_result.markdown) +# 3. 
Enhanced Markdown Generation Example +async def markdown_generation_example(): + """Example of enhanced markdown generation with citations and LLM-friendly features""" + async with AsyncWebCrawler(verbose=True) as crawler: + # Create a content filter (optional) + content_filter = BM25ContentFilter( + # user_query="History and cultivation", + bm25_threshold=1.0 + ) + + result = await crawler.arun( + url="https://en.wikipedia.org/wiki/Apple", + css_selector="main div#bodyContent", + content_filter=content_filter, + cache_mode=CacheMode.BYPASS + ) + + from crawl4ai import AsyncWebCrawler + from crawl4ai.content_filter_strategy import BM25ContentFilter + + result = await crawler.arun( + url="https://en.wikipedia.org/wiki/Apple", + css_selector="main div#bodyContent", + content_filter=BM25ContentFilter() + ) + print(result.markdown_v2.fit_markdown) + + print("\nMarkdown Generation Results:") + print(f"1. Original markdown length: {len(result.markdown)}") + print(f"2. New markdown versions (markdown_v2):") + print(f" - Raw markdown length: {len(result.markdown_v2.raw_markdown)}") + print(f" - Citations markdown length: {len(result.markdown_v2.markdown_with_citations)}") + print(f" - References section length: {len(result.markdown_v2.references_markdown)}") + if result.markdown_v2.fit_markdown: + print(f" - Filtered markdown length: {len(result.markdown_v2.fit_markdown)}") + + # Save examples to files + output_dir = os.path.join(__data__, "markdown_examples") + os.makedirs(output_dir, exist_ok=True) + + # Save different versions + with open(os.path.join(output_dir, "1_raw_markdown.md"), "w") as f: + f.write(result.markdown_v2.raw_markdown) + + with open(os.path.join(output_dir, "2_citations_markdown.md"), "w") as f: + f.write(result.markdown_v2.markdown_with_citations) + + with open(os.path.join(output_dir, "3_references.md"), "w") as f: + f.write(result.markdown_v2.references_markdown) + + if result.markdown_v2.fit_markdown: + with open(os.path.join(output_dir, 
"4_filtered_markdown.md"), "w") as f: + f.write(result.markdown_v2.fit_markdown) + + print(f"\nMarkdown examples saved to: {output_dir}") + + # Show a sample of citations and references + print("\nSample of markdown with citations:") + print(result.markdown_v2.markdown_with_citations[:500] + "...\n") + print("Sample of references:") + print('\n'.join(result.markdown_v2.references_markdown.split('\n')[:10]) + "...") + # 4. Browser Management Example async def browser_management_example(): """Example of using enhanced browser management features""" @@ -208,9 +243,13 @@ async def api_example(): headers=headers ) as status_response: result = await status_response.json() - print(f"Task result: {result}") + print(f"Task status: {result['status']}") if result["status"] == "completed": + print("Task completed!") + print("Results:") + news = json.loads(result["results"][0]['extracted_content']) + print(json.dumps(news[:4], indent=2)) break else: await asyncio.sleep(1) @@ -220,15 +259,15 @@ async def main(): # print("Running Crawl4AI feature examples...") # print("\n1. Running Download Example:") - await download_example() + # await download_example() - # print("\n2. Running Content Filtering Example:") - await content_filtering_example() + # print("\n2. Running Markdown Generation Example:") + # await markdown_generation_example() - # print("\n3. Running Local and Raw HTML Example:") - await local_and_raw_html_example() + # # print("\n3. Running Local and Raw HTML Example:") + # await local_and_raw_html_example() - # print("\n4. Running Browser Management Example:") + # # print("\n4. Running Browser Management Example:") await browser_management_example() # print("\n5. Running API Example:") From d729aa7d5edf9dab069af06e0c4ade1ca997eef7 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sat, 23 Nov 2024 18:00:32 +0800 Subject: [PATCH 064/115] refactor: Add group ID to for images extracted from srcset. 
--- crawl4ai/content_scraping_strategy.py | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index d4b901d2..70a43240 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -228,24 +228,7 @@ class WebScrapingStrategy(ContentScrapingStrategy): return None def process_image_old(img, url, index, total_images): - def parse_srcset(srcset_str): - """Parse srcset attribute into list of image URLs with their sizes.""" - if not srcset_str: - return [] - - sources = [] - # Split on http/https and filter empty strings - urls = [f"http{part}" for part in srcset_str.split("http") if part] - - for url in urls: - # Remove trailing comma and whitespace, then split to get width - url = url.strip().rstrip(',') - parts = url.rsplit(' ', 1) - img_url = parts[0].strip() - width = parts[1].rstrip('w') if len(parts) > 1 else None - sources.append({'url': img_url, 'width': width}) - - return sources + #Check if an image has valid display and inside undesired html elements def is_valid_image(img, parent, parent_classes): @@ -376,12 +359,16 @@ class WebScrapingStrategy(ContentScrapingStrategy): unique_urls = set() image_variants = [] + # Generate a unique group ID for this set of variants + group_id = index + # Base image info template base_info = { 'alt': alt, 'desc': find_closest_parent_with_useful_text(img), 'score': score, - 'type': 'image' + 'type': 'image', + 'group_id': group_id # Group ID for this set of variants } # Inline function for adding variants From 829a1f7992703064084826e0ebfeed819988c6e7 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sat, 23 Nov 2024 19:45:41 +0800 Subject: [PATCH 065/115] feat: update version to 0.3.741 and enhance content filtering with heuristic strategy. Fixing the issue that when the past HTML to BM25 content filter does not have any HTML elements. 
--- crawl4ai/__version__.py | 2 +- crawl4ai/content_filter_strategy.py | 188 +++++++++++++++++++++++++- crawl4ai/content_scraping_strategy.py | 8 +- 3 files changed, 189 insertions(+), 9 deletions(-) diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index 65ee6e73..05bfd336 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,2 +1,2 @@ # crawl4ai/_version.py -__version__ = "0.3.74" \ No newline at end of file +__version__ = "0.3.741" \ No newline at end of file diff --git a/crawl4ai/content_filter_strategy.py b/crawl4ai/content_filter_strategy.py index 88216f7f..e6891a3f 100644 --- a/crawl4ai/content_filter_strategy.py +++ b/crawl4ai/content_filter_strategy.py @@ -10,6 +10,13 @@ from abc import ABC, abstractmethod from snowballstemmer import stemmer + +# import regex +# def tokenize_text(text): +# # Regular expression to match words or CJK (Chinese, Japanese, Korean) characters +# pattern = r'\p{L}+|\p{N}+|[\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}ー]|[\p{P}]' +# return regex.findall(pattern, text) + # from nltk.stem import PorterStemmer # ps = PorterStemmer() class RelevantContentFilter(ABC): @@ -57,9 +64,14 @@ class RelevantContentFilter(ABC): query_parts = [] # Title - if soup.title: - query_parts.append(soup.title.string) - elif soup.find('h1'): + try: + title = soup.title.string + if title: + query_parts.append(title) + except Exception: + pass + + if soup.find('h1'): query_parts.append(soup.find('h1').get_text()) # Meta tags @@ -81,7 +93,7 @@ class RelevantContentFilter(ABC): return ' '.join(filter(None, query_parts)) - def extract_text_chunks(self, body: Tag) -> List[Tuple[str, str]]: + def extract_text_chunks(self, body: Tag, min_word_threshold: int = None) -> List[Tuple[str, str]]: """ Extracts text chunks from a BeautifulSoup body element while preserving order. Returns list of tuples (text, tag_name) for classification. 
@@ -155,6 +167,9 @@ class RelevantContentFilter(ABC): if text: chunks.append((chunk_index, text, 'content', body)) + if min_word_threshold: + chunks = [chunk for chunk in chunks if len(chunk[1].split()) >= min_word_threshold] + return chunks @@ -274,15 +289,26 @@ class BM25ContentFilter(RelevantContentFilter): } self.stemmer = stemmer(language) - def filter_content(self, html: str) -> List[str]: + def filter_content(self, html: str, min_word_threshold: int = None) -> List[str]: """Implements content filtering using BM25 algorithm with priority tag handling""" if not html or not isinstance(html, str): return [] soup = BeautifulSoup(html, 'lxml') + + # Check if body is present + if not soup.body: + # Wrap in body tag if missing + soup = BeautifulSoup(f'{html}', 'lxml') body = soup.find('body') - query = self.extract_page_query(soup.find('head'), body) - candidates = self.extract_text_chunks(body) + + query = self.extract_page_query(soup, body) + + if not query: + return [] + # return [self.clean_element(soup)] + + candidates = self.extract_text_chunks(body, min_word_threshold) if not candidates: return [] @@ -299,6 +325,10 @@ class BM25ContentFilter(RelevantContentFilter): for _, chunk, _, _ in candidates] tokenized_query = [self.stemmer.stemWord(word) for word in query.lower().split()] + # tokenized_corpus = [[self.stemmer.stemWord(word) for word in tokenize_text(chunk.lower())] + # for _, chunk, _, _ in candidates] + # tokenized_query = [self.stemmer.stemWord(word) for word in tokenize_text(query.lower())] + # Clean from stop words and noise tokenized_corpus = [clean_tokens(tokens) for tokens in tokenized_corpus] tokenized_query = clean_tokens(tokenized_query) @@ -326,3 +356,147 @@ class BM25ContentFilter(RelevantContentFilter): selected_candidates.sort(key=lambda x: x[0]) return [self.clean_element(tag) for _, _, tag in selected_candidates] + + +class HeuristicContentFilter(RelevantContentFilter): + def __init__(self): + super().__init__() + # Weights for 
different heuristics + self.tag_weights = { + 'article': 10, + 'main': 8, + 'section': 5, + 'div': 3, + 'p': 2, + 'pre': 2, + 'code': 2, + 'blockquote': 2, + 'li': 1, + 'span': 1, + } + self.max_depth = 5 # Maximum depth from body to consider + + def filter_content(self, html: str) -> List[str]: + """Implements heuristic content filtering without relying on a query.""" + if not html or not isinstance(html, str): + return [] + + soup = BeautifulSoup(html, 'lxml') + + # Ensure there is a body tag + if not soup.body: + soup = BeautifulSoup(f'{html}', 'lxml') + body = soup.body + + # Extract candidate text chunks + candidates = self.extract_text_chunks(body) + + if not candidates: + return [] + + # Score each candidate + scored_candidates = [] + for index, text, tag_type, tag in candidates: + score = self.score_element(tag, text) + if score > 0: + scored_candidates.append((score, index, text, tag)) + + # Sort candidates by score and then by document order + scored_candidates.sort(key=lambda x: (-x[0], x[1])) + + # Extract the top candidates (e.g., top 5) + top_candidates = scored_candidates[:5] # Adjust the number as needed + + # Sort the top candidates back to their original document order + top_candidates.sort(key=lambda x: x[1]) + + # Clean and return the content + return [self.clean_element(tag) for _, _, _, tag in top_candidates] + + def score_element(self, tag: Tag, text: str) -> float: + """Compute a score for an element based on heuristics.""" + if not text or not tag: + return 0 + + # Exclude unwanted tags + if self.is_excluded(tag): + return 0 + + # Text density + text_length = len(text.strip()) + html_length = len(str(tag)) + text_density = text_length / html_length if html_length > 0 else 0 + + # Link density + link_text_length = sum(len(a.get_text().strip()) for a in tag.find_all('a')) + link_density = link_text_length / text_length if text_length > 0 else 0 + + # Tag weight + tag_weight = self.tag_weights.get(tag.name, 1) + + # Depth factor (prefer 
elements closer to the body tag) + depth = self.get_depth(tag) + depth_weight = max(self.max_depth - depth, 1) / self.max_depth + + # Compute the final score + score = (text_density * tag_weight * depth_weight) / (1 + link_density) + + return score + + def get_depth(self, tag: Tag) -> int: + """Compute the depth of the tag from the body tag.""" + depth = 0 + current = tag + while current and current != current.parent and current.name != 'body': + current = current.parent + depth += 1 + return depth + + def extract_text_chunks(self, body: Tag) -> List[Tuple[int, str, str, Tag]]: + """ + Extracts text chunks from the body element while preserving order. + Returns list of tuples (index, text, tag_type, tag) for scoring. + """ + chunks = [] + index = 0 + + def traverse(element): + nonlocal index + if isinstance(element, NavigableString): + return + if not isinstance(element, Tag): + return + if self.is_excluded(element): + return + # Only consider included tags + if element.name in self.included_tags: + text = element.get_text(separator=' ', strip=True) + if len(text.split()) >= self.min_word_count: + tag_type = 'header' if element.name in self.header_tags else 'content' + chunks.append((index, text, tag_type, element)) + index += 1 + # Do not traverse children of this element to prevent duplication + return + for child in element.children: + traverse(child) + + traverse(body) + return chunks + + def is_excluded(self, tag: Tag) -> bool: + """Determine if a tag should be excluded based on heuristics.""" + if tag.name in self.excluded_tags: + return True + class_id = ' '.join(filter(None, [ + ' '.join(tag.get('class', [])), + tag.get('id', '') + ])) + if self.negative_patterns.search(class_id): + return True + # Exclude tags with high link density (e.g., navigation menus) + text = tag.get_text(separator=' ', strip=True) + link_text_length = sum(len(a.get_text(strip=True)) for a in tag.find_all('a')) + text_length = len(text) + if text_length > 0 and (link_text_length / 
text_length) > 0.5: + return True + return False diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index 70a43240..ea6a2ef8 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -9,7 +9,7 @@ from bs4 import element, NavigableString, Comment from urllib.parse import urljoin from requests.exceptions import InvalidSchema # from .content_cleaning_strategy import ContentCleaningStrategy -from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter +from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter#, HeuristicContentFilter from .markdown_generation_strategy import MarkdownGenerationStrategy, DefaultMarkdownGenerationStrategy from .models import MarkdownGenerationResult from .utils import ( @@ -129,6 +129,12 @@ class WebScrapingStrategy(ContentScrapingStrategy): params={"error": str(e)} ) markdown_generator = None + return { + 'markdown': f"Error using new markdown generation strategy: {str(e)}", + 'fit_markdown': "Set flag 'fit_markdown' to True to get cleaned HTML content.", + 'fit_html': "Set flag 'fit_markdown' to True to get cleaned HTML content.", + 'markdown_v2': None + } # Legacy method h = CustomHTML2Text() From edad7b6a742249f324d3baba01095f93fc05912f Mon Sep 17 00:00:00 2001 From: unclecode Date: Sun, 24 Nov 2024 18:48:39 +0800 Subject: [PATCH 066/115] chore: remove Railway deployment configuration and related documentation --- deploy/railway/README.md | 19 - deploy/railway/button.json | 33 -- deploy/railway/railway.toml | 18 - pages/app.css | 131 ----- pages/app.js | 356 ------------ pages/index copy.html | 971 -------------------------------- pages/index.html | 73 --- pages/index_pooling.html | 425 -------------- pages/partial/footer.html | 36 -- pages/partial/how_to_guide.html | 174 ------ pages/partial/installation.html | 65 --- pages/partial/try_it.html | 217 ------- pages/tmp.html | 434 -------------- 13 files changed, 2952 
deletions(-) delete mode 100644 deploy/railway/README.md delete mode 100644 deploy/railway/button.json delete mode 100644 deploy/railway/railway.toml delete mode 100644 pages/app.css delete mode 100644 pages/app.js delete mode 100644 pages/index copy.html delete mode 100644 pages/index.html delete mode 100644 pages/index_pooling.html delete mode 100644 pages/partial/footer.html delete mode 100644 pages/partial/how_to_guide.html delete mode 100644 pages/partial/installation.html delete mode 100644 pages/partial/try_it.html delete mode 100644 pages/tmp.html diff --git a/deploy/railway/README.md b/deploy/railway/README.md deleted file mode 100644 index 155e7642..00000000 --- a/deploy/railway/README.md +++ /dev/null @@ -1,19 +0,0 @@ -# Railway Deployment - -## Quick Deploy -[![Deploy on Railway](https://railway.app/button.svg)](https://railway.app/template/crawl4ai) - -## Manual Setup -1. Fork this repository -2. Create a new Railway project -3. Configure environment variables: - - `INSTALL_TYPE`: basic or all - - `ENABLE_GPU`: true/false -4. Deploy! 
- -## Configuration -See `railway.toml` for: -- Memory limits -- Health checks -- Restart policies -- Scaling options \ No newline at end of file diff --git a/deploy/railway/button.json b/deploy/railway/button.json deleted file mode 100644 index 1fc52167..00000000 --- a/deploy/railway/button.json +++ /dev/null @@ -1,33 +0,0 @@ -{ - "name": "Crawl4AI", - "description": "LLM Friendly Web Crawler & Scraper", - "render": { - "dockerfile": { - "path": "Dockerfile" - } - }, - "env": [ - { - "key": "INSTALL_TYPE", - "description": "Installation type (basic/all)", - "default": "basic", - "required": true - }, - { - "key": "ENABLE_GPU", - "description": "Enable GPU support", - "default": "false", - "required": false - } - ], - "services": [ - { - "name": "web", - "dockerfile": "./Dockerfile", - "healthcheck": { - "path": "/health", - "port": 11235 - } - } - ] - } \ No newline at end of file diff --git a/deploy/railway/railway.toml b/deploy/railway/railway.toml deleted file mode 100644 index f24d8fab..00000000 --- a/deploy/railway/railway.toml +++ /dev/null @@ -1,18 +0,0 @@ -# railway.toml -[build] -builder = "DOCKERFILE" -dockerfilePath = "Dockerfile" - -[deploy] -startCommand = "uvicorn main:app --host 0.0.0.0 --port $PORT" -healthcheckPath = "/health" -restartPolicyType = "ON_FAILURE" -restartPolicyMaxRetries = 3 - -[deploy.memory] -soft = 2048 # 2GB min for Playwright -hard = 4096 # 4GB max - -[deploy.scaling] -min = 1 -max = 1 diff --git a/pages/app.css b/pages/app.css deleted file mode 100644 index 0e94a2e5..00000000 --- a/pages/app.css +++ /dev/null @@ -1,131 +0,0 @@ -:root { - --ifm-font-size-base: 100%; - --ifm-line-height-base: 1.65; - --ifm-font-family-base: system-ui, -apple-system, Segoe UI, Roboto, Ubuntu, Cantarell, Noto Sans, sans-serif, - BlinkMacSystemFont, "Segoe UI", Helvetica, Arial, sans-serif, "Apple Color Emoji", "Segoe UI Emoji", - "Segoe UI Symbol"; -} -html { - -webkit-font-smoothing: antialiased; - -webkit-text-size-adjust: 100%; - 
text-size-adjust: 100%; - font: var(--ifm-font-size-base) / var(--ifm-line-height-base) var(--ifm-font-family-base); -} -body { - background-color: #1a202c; - color: #fff; -} -.tab-content { - max-height: 400px; - overflow: auto; -} -pre { - white-space: pre-wrap; - font-size: 14px; -} -pre code { - width: 100%; -} - -/* Custom styling for docs-item class and Markdown generated elements */ -.docs-item { - background-color: #2d3748; /* bg-gray-800 */ - padding: 1rem; /* p-4 */ - border-radius: 0.375rem; /* rounded */ - box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1); /* shadow-md */ - margin-bottom: 1rem; /* space between items */ - line-height: 1.5; /* leading-normal */ -} - -.docs-item h3, -.docs-item h4 { - color: #ffffff; /* text-white */ - font-size: 1.25rem; /* text-xl */ - font-weight: 700; /* font-bold */ - margin-bottom: 0.5rem; /* mb-2 */ -} -.docs-item h4 { - font-size: 1rem; /* text-xl */ -} - -.docs-item p { - color: #e2e8f0; /* text-gray-300 */ - margin-bottom: 0.5rem; /* mb-2 */ -} - -.docs-item code { - background-color: #1a202c; /* bg-gray-900 */ - color: #e2e8f0; /* text-gray-300 */ - padding: 0.25rem 0.5rem; /* px-2 py-1 */ - border-radius: 0.25rem; /* rounded */ - font-size: 0.875rem; /* text-sm */ -} - -.docs-item pre { - background-color: #1a202c; /* bg-gray-900 */ - color: #e2e8f0; /* text-gray-300 */ - padding: 0.5rem; /* p-2 */ - border-radius: 0.375rem; /* rounded */ - overflow: auto; /* overflow-auto */ - margin-bottom: 0.5rem; /* mb-2 */ -} - -.docs-item div { - color: #e2e8f0; /* text-gray-300 */ - font-size: 1rem; /* prose prose-sm */ - line-height: 1.25rem; /* line-height for readability */ -} - -/* Adjustments to make prose class more suitable for dark mode */ -.prose { - max-width: none; /* max-w-none */ -} - -.prose p, -.prose ul { - margin-bottom: 1rem; /* mb-4 */ -} - -.prose code { - /* background-color: #4a5568; */ /* bg-gray-700 */ - color: #65a30d; /* text-white */ - padding: 0.25rem 0.5rem; /* px-1 py-0.5 */ - border-radius: 
0.25rem; /* rounded */ - display: inline-block; /* inline-block */ -} - -.prose pre { - background-color: #1a202c; /* bg-gray-900 */ - color: #ffffff; /* text-white */ - padding: 0.5rem; /* p-2 */ - border-radius: 0.375rem; /* rounded */ -} - -.prose h3 { - color: #65a30d; /* text-white */ - font-size: 1.25rem; /* text-xl */ - font-weight: 700; /* font-bold */ - margin-bottom: 0.5rem; /* mb-2 */ -} - -body { - background-color: #1a1a1a; - color: #b3ff00; -} -.sidebar { - color: #b3ff00; - border-right: 1px solid #333; -} -.sidebar a { - color: #b3ff00; - text-decoration: none; -} -.sidebar a:hover { - background-color: #555; -} -.content-section { - display: none; -} -.content-section.active { - display: block; -} diff --git a/pages/app.js b/pages/app.js deleted file mode 100644 index 098008ab..00000000 --- a/pages/app.js +++ /dev/null @@ -1,356 +0,0 @@ -// JavaScript to manage dynamic form changes and logic -document.getElementById("extraction-strategy-select").addEventListener("change", function () { - const strategy = this.value; - const providerModelSelect = document.getElementById("provider-model-select"); - const tokenInput = document.getElementById("token-input"); - const instruction = document.getElementById("instruction"); - const semantic_filter = document.getElementById("semantic_filter"); - const instruction_div = document.getElementById("instruction_div"); - const semantic_filter_div = document.getElementById("semantic_filter_div"); - const llm_settings = document.getElementById("llm_settings"); - - if (strategy === "LLMExtractionStrategy") { - // providerModelSelect.disabled = false; - // tokenInput.disabled = false; - // semantic_filter.disabled = true; - // instruction.disabled = false; - llm_settings.classList.remove("hidden"); - instruction_div.classList.remove("hidden"); - semantic_filter_div.classList.add("hidden"); - } else if (strategy === "NoExtractionStrategy") { - semantic_filter_div.classList.add("hidden"); - 
instruction_div.classList.add("hidden"); - llm_settings.classList.add("hidden"); - } else { - // providerModelSelect.disabled = true; - // tokenInput.disabled = true; - // semantic_filter.disabled = false; - // instruction.disabled = true; - llm_settings.classList.add("hidden"); - instruction_div.classList.add("hidden"); - semantic_filter_div.classList.remove("hidden"); - } - - -}); - -// Get the selected provider model and token from local storage -const storedProviderModel = localStorage.getItem("provider_model"); -const storedToken = localStorage.getItem(storedProviderModel); - -if (storedProviderModel) { - document.getElementById("provider-model-select").value = storedProviderModel; -} - -if (storedToken) { - document.getElementById("token-input").value = storedToken; -} - -// Handle provider model dropdown change -document.getElementById("provider-model-select").addEventListener("change", () => { - const selectedProviderModel = document.getElementById("provider-model-select").value; - const storedToken = localStorage.getItem(selectedProviderModel); - - if (storedToken) { - document.getElementById("token-input").value = storedToken; - } else { - document.getElementById("token-input").value = ""; - } -}); - -// Fetch total count from the database -axios - .get("/total-count") - .then((response) => { - document.getElementById("total-count").textContent = response.data.count; - }) - .catch((error) => console.error(error)); - -// Handle crawl button click -document.getElementById("crawl-btn").addEventListener("click", () => { - // validate input to have both URL and API token - // if selected extraction strategy is LLMExtractionStrategy, then API token is required - if (document.getElementById("extraction-strategy-select").value === "LLMExtractionStrategy") { - if (!document.getElementById("url-input").value || !document.getElementById("token-input").value) { - alert("Please enter both URL(s) and API token."); - return; - } - } - - const selectedProviderModel = 
document.getElementById("provider-model-select").value; - const apiToken = document.getElementById("token-input").value; - const extractBlocks = document.getElementById("extract-blocks-checkbox").checked; - const bypassCache = document.getElementById("bypass-cache-checkbox").checked; - - // Save the selected provider model and token to local storage - localStorage.setItem("provider_model", selectedProviderModel); - localStorage.setItem(selectedProviderModel, apiToken); - - const urlsInput = document.getElementById("url-input").value; - const urls = urlsInput.split(",").map((url) => url.trim()); - const data = { - urls: urls, - include_raw_html: true, - bypass_cache: bypassCache, - extract_blocks: extractBlocks, - word_count_threshold: parseInt(document.getElementById("threshold").value), - extraction_strategy: document.getElementById("extraction-strategy-select").value, - extraction_strategy_args: { - provider: selectedProviderModel, - api_token: apiToken, - instruction: document.getElementById("instruction").value, - semantic_filter: document.getElementById("semantic_filter").value, - }, - chunking_strategy: document.getElementById("chunking-strategy-select").value, - chunking_strategy_args: {}, - css_selector: document.getElementById("css-selector").value, - screenshot: document.getElementById("screenshot-checkbox").checked, - // instruction: document.getElementById("instruction").value, - // semantic_filter: document.getElementById("semantic_filter").value, - verbose: true, - }; - - // import requests - - // data = { - // "urls": [ - // "https://www.nbcnews.com/business" - // ], - // "word_count_threshold": 10, - // "extraction_strategy": "NoExtractionStrategy", - // } - - // response = requests.post("https://crawl4ai.com/crawl", json=data) # OR local host if your run locally - // print(response.json()) - - // save api token to local storage - localStorage.setItem("api_token", document.getElementById("token-input").value); - - 
document.getElementById("loading").classList.remove("hidden"); - document.getElementById("result").style.visibility = "hidden"; - document.getElementById("code_help").style.visibility = "hidden"; - - axios - .post("/crawl", data) - .then((response) => { - const result = response.data.results[0]; - const parsedJson = JSON.parse(result.extracted_content); - document.getElementById("json-result").textContent = JSON.stringify(parsedJson, null, 2); - document.getElementById("cleaned-html-result").textContent = result.cleaned_html; - document.getElementById("markdown-result").textContent = result.markdown; - document.getElementById("media-result").textContent = JSON.stringify( result.media, null, 2); - if (result.screenshot){ - const imgElement = document.createElement("img"); - // Set the src attribute with the base64 data - imgElement.src = `data:image/png;base64,${result.screenshot}`; - document.getElementById("screenshot-result").innerHTML = ""; - document.getElementById("screenshot-result").appendChild(imgElement); - } - - // Update code examples dynamically - const extractionStrategy = data.extraction_strategy; - const isLLMExtraction = extractionStrategy === "LLMExtractionStrategy"; - - // REMOVE API TOKEN FROM CODE EXAMPLES - data.extraction_strategy_args.api_token = "your_api_token"; - - if (data.extraction_strategy === "NoExtractionStrategy") { - delete data.extraction_strategy_args; - delete data.extrac_blocks; - } - - if (data.chunking_strategy === "RegexChunking") { - delete data.chunking_strategy_args; - } - - delete data.verbose; - - if (data.css_selector === "") { - delete data.css_selector; - } - - if (!data.bypass_cache) { - delete data.bypass_cache; - } - - if (!data.extract_blocks) { - delete data.extract_blocks; - } - - if (!data.include_raw_html) { - delete data.include_raw_html; - } - - document.getElementById( - "curl-code" - ).textContent = `curl -X POST -H "Content-Type: application/json" -d '${JSON.stringify({ - ...data, - api_token: 
isLLMExtraction ? "your_api_token" : undefined, - }, null, 2)}' https://crawl4ai.com/crawl`; - - document.getElementById("python-code").textContent = `import requests\n\ndata = ${JSON.stringify( - { ...data, api_token: isLLMExtraction ? "your_api_token" : undefined }, - null, - 2 - )}\n\nresponse = requests.post("https://crawl4ai.com/crawl", json=data) # OR local host if your run locally \nprint(response.json())`; - - document.getElementById( - "nodejs-code" - ).textContent = `const axios = require('axios');\n\nconst data = ${JSON.stringify( - { ...data, api_token: isLLMExtraction ? "your_api_token" : undefined }, - null, - 2 - )};\n\naxios.post("https://crawl4ai.com/crawl", data) // OR local host if your run locally \n .then(response => console.log(response.data))\n .catch(error => console.error(error));`; - - document.getElementById( - "library-code" - ).textContent = `from crawl4ai.web_crawler import WebCrawler\nfrom crawl4ai.extraction_strategy import *\nfrom crawl4ai.chunking_strategy import *\n\ncrawler = WebCrawler()\ncrawler.warmup()\n\nresult = crawler.run(\n url='${ - urls[0] - }',\n word_count_threshold=${data.word_count_threshold},\n extraction_strategy=${ - isLLMExtraction - ? 
`${extractionStrategy}(provider="${data.provider_model}", api_token="${data.api_token}")` - : extractionStrategy + "()" - },\n chunking_strategy=${data.chunking_strategy}(),\n bypass_cache=${ - data.bypass_cache - },\n css_selector="${data.css_selector}"\n)\nprint(result)`; - - // Highlight code syntax - hljs.highlightAll(); - - // Select JSON tab by default - document.querySelector('.tab-btn[data-tab="json"]').click(); - - document.getElementById("loading").classList.add("hidden"); - - document.getElementById("result").style.visibility = "visible"; - document.getElementById("code_help").style.visibility = "visible"; - - // increment the total count - document.getElementById("total-count").textContent = - parseInt(document.getElementById("total-count").textContent) + 1; - }) - .catch((error) => { - console.error(error); - document.getElementById("loading").classList.add("hidden"); - }); -}); - -// Handle tab clicks -document.querySelectorAll(".tab-btn").forEach((btn) => { - btn.addEventListener("click", () => { - const tab = btn.dataset.tab; - document.querySelectorAll(".tab-btn").forEach((b) => b.classList.remove("bg-lime-700", "text-white")); - btn.classList.add("bg-lime-700", "text-white"); - document.querySelectorAll(".tab-content.code pre").forEach((el) => el.classList.add("hidden")); - document.getElementById(`${tab}-result`).parentElement.classList.remove("hidden"); - }); -}); - -// Handle code tab clicks -document.querySelectorAll(".code-tab-btn").forEach((btn) => { - btn.addEventListener("click", () => { - const tab = btn.dataset.tab; - document.querySelectorAll(".code-tab-btn").forEach((b) => b.classList.remove("bg-lime-700", "text-white")); - btn.classList.add("bg-lime-700", "text-white"); - document.querySelectorAll(".tab-content.result pre").forEach((el) => el.classList.add("hidden")); - document.getElementById(`${tab}-code`).parentElement.classList.remove("hidden"); - }); -}); - -// Handle copy to clipboard button clicks - -async function 
copyToClipboard(text) { - if (navigator.clipboard && navigator.clipboard.writeText) { - return navigator.clipboard.writeText(text); - } else { - return fallbackCopyTextToClipboard(text); - } -} - -function fallbackCopyTextToClipboard(text) { - return new Promise((resolve, reject) => { - const textArea = document.createElement("textarea"); - textArea.value = text; - - // Avoid scrolling to bottom - textArea.style.top = "0"; - textArea.style.left = "0"; - textArea.style.position = "fixed"; - - document.body.appendChild(textArea); - textArea.focus(); - textArea.select(); - - try { - const successful = document.execCommand("copy"); - if (successful) { - resolve(); - } else { - reject(); - } - } catch (err) { - reject(err); - } - - document.body.removeChild(textArea); - }); -} - -document.querySelectorAll(".copy-btn").forEach((btn) => { - btn.addEventListener("click", () => { - const target = btn.dataset.target; - const code = document.getElementById(target).textContent; - //navigator.clipboard.writeText(code).then(() => { - copyToClipboard(code).then(() => { - btn.textContent = "Copied!"; - setTimeout(() => { - btn.textContent = "Copy"; - }, 2000); - }); - }); -}); - -document.addEventListener("DOMContentLoaded", async () => { - try { - const extractionResponse = await fetch("/strategies/extraction"); - const extractionStrategies = await extractionResponse.json(); - - const chunkingResponse = await fetch("/strategies/chunking"); - const chunkingStrategies = await chunkingResponse.json(); - - renderStrategies("extraction-strategies", extractionStrategies); - renderStrategies("chunking-strategies", chunkingStrategies); - } catch (error) { - console.error("Error fetching strategies:", error); - } -}); - -function renderStrategies(containerId, strategies) { - const container = document.getElementById(containerId); - container.innerHTML = ""; // Clear any existing content - strategies = JSON.parse(strategies); - Object.entries(strategies).forEach(([strategy, description]) 
=> { - const strategyElement = document.createElement("div"); - strategyElement.classList.add("bg-zinc-800", "p-4", "rounded", "shadow-md", "docs-item"); - - const strategyDescription = document.createElement("div"); - strategyDescription.classList.add("text-gray-300", "prose", "prose-sm"); - strategyDescription.innerHTML = marked.parse(description); - - strategyElement.appendChild(strategyDescription); - - container.appendChild(strategyElement); - }); -} -document.querySelectorAll(".sidebar a").forEach((link) => { - link.addEventListener("click", function (event) { - event.preventDefault(); - document.querySelectorAll(".content-section").forEach((section) => { - section.classList.remove("active"); - }); - const target = event.target.getAttribute("data-target"); - document.getElementById(target).classList.add("active"); - }); -}); -// Highlight code syntax -hljs.highlightAll(); diff --git a/pages/index copy.html b/pages/index copy.html deleted file mode 100644 index b61b7298..00000000 --- a/pages/index copy.html +++ /dev/null @@ -1,971 +0,0 @@ - - - - - - Crawl4AI - - - - - - - - - - - - - - - - -
    -
    -

    🔥🕷️ Crawl4AI: Web Data for your Thoughts

    -
    -
    - 📊 Total Website Processed - 2 -
    -
    - -
    -
    -

    Try It Now

    -
    -
    -
    - - -
    -
    - - -
    -
    - - -
    -
    - - -
    -
    - - -
    -
    - - -
    -
    - - -
    -
    -
    - - -
    -
    - - -
    - -
    -
    - -
    - -
    - - - -
    -
    -
    - - -
    -
    - -
    -
    - - - - -
    -
    -
    -                                
    -                                
    -                            
    - - - -
    -
    -
    -
    -
    -
    -
    - -
    - 🌟 Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! -
    -
    - First Step: Create an instance of WebCrawler and call the warmup() function. -
    -
    -
    crawler = WebCrawler()
    -            crawler.warmup()
    -
    - - -
    - 🧠 Understanding 'bypass_cache' and 'include_raw_html' parameters: -
    -
    First crawl (caches the result):
    -
    -
    result = crawler.run(url="https://www.nbcnews.com/business")
    -
    -
    Second crawl (Force to crawl again):
    -
    -
    result = crawler.run(url="https://www.nbcnews.com/business", bypass_cache=True)
    -
    -
    Crawl result without raw HTML content:
    -
    -
    result = crawler.run(url="https://www.nbcnews.com/business", include_raw_html=False)
    -
    - - -
    - 📄 - The 'include_raw_html' parameter, when set to True, includes the raw HTML content in the - response. By default, it is set to True. -
    -
    Set always_by_pass_cache to True:
    -
    -
    crawler.always_by_pass_cache = True
    -
    - - -
    - 🧩 Let's add a chunking strategy: RegexChunking! -
    -
    Using RegexChunking:
    -
    -
    result = crawler.run(
    -                url="https://www.nbcnews.com/business",
    -                chunking_strategy=RegexChunking(patterns=["\n\n"])
    -            )
    -
    -
    Using NlpSentenceChunking:
    -
    -
    result = crawler.run(
    -                url="https://www.nbcnews.com/business",
    -                chunking_strategy=NlpSentenceChunking()
    -            )
    -
    - - -
    - 🧠 Let's get smarter with an extraction strategy: CosineStrategy! -
    -
    Using CosineStrategy:
    -
    -
    result = crawler.run(
    -                url="https://www.nbcnews.com/business",
    -                extraction_strategy=CosineStrategy(word_count_threshold=10, max_dist=0.2, linkage_method="ward", top_k=3)
    -            )
    -
    - - -
    - 🤖 Time to bring in the big guns: LLMExtractionStrategy without instructions! -
    -
    Using LLMExtractionStrategy without instructions:
    -
    -
    result = crawler.run(
    -                url="https://www.nbcnews.com/business",
    -                extraction_strategy=LLMExtractionStrategy(provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY'))
    -            )
    -
    - - -
    - 📜 Let's make it even more interesting: LLMExtractionStrategy with instructions! -
    -
    Using LLMExtractionStrategy with instructions:
    -
    -
    result = crawler.run(
    -                url="https://www.nbcnews.com/business",
    -                extraction_strategy=LLMExtractionStrategy(
    -                    provider="openai/gpt-4o",
    -                    api_token=os.getenv('OPENAI_API_KEY'),
    -                    instruction="I am interested in only financial news"
    -                )
    -            )
    -
    - - -
    - 🎯 Targeted extraction: Let's use a CSS selector to extract only H2 tags! -
    -
    Using CSS selector to extract H2 tags:
    -
    -
    result = crawler.run(
    -                url="https://www.nbcnews.com/business",
    -                css_selector="h2"
    -            )
    -
    - - -
    - 🖱️ Let's get interactive: Passing JavaScript code to click 'Load More' button! -
    -
    Using JavaScript to click 'Load More' button:
    -
    -
    js_code = """
    -            const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
    -            loadMoreButton && loadMoreButton.click();
    -            """
    -            crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
    -            crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
    -            result = crawler.run(url="https://www.nbcnews.com/business")
    -
    - - -
    - 🎉 - Congratulations! You've made it through the Crawl4ai Quickstart Guide! Now go forth and crawl - the web like a pro! 🕸️ -
    -
    -
    -
    -

    Installation 💻

    -

    - There are two ways to use Crawl4AI: as a library in your Python projects or as a standalone local - server. -

    - -

    - You can also try Crawl4AI in a Google Colab - Open In Colab -

    - -

    Using Crawl4AI as a Library 📚

    -

    To install Crawl4AI as a library, follow these steps:

    - -
      -
    1. - Install the package from GitHub: -
      pip install git+https://github.com/unclecode/crawl4ai.git
      -
    2. -
    3. - Alternatively, you can clone the repository and install the package locally: -
      virtualenv venv
      -source venv/bin/activate
      -git clone https://github.com/unclecode/crawl4ai.git
      -cd crawl4ai
      -pip install -e .
      -        
      -
    4. -
    5. - Import the necessary modules in your Python script: -
      from crawl4ai.web_crawler import WebCrawler
      -from crawl4ai.chunking_strategy import *
      -from crawl4ai.extraction_strategy import *
      -import os
      -
      -crawler = WebCrawler()
      -
      -# Single page crawl
      -single_url = UrlModel(url='https://www.nbcnews.com/business', forced=False)
      -result = crawl4ai.fetch_page(
      -    url='https://www.nbcnews.com/business',
      -    word_count_threshold=5, # Minimum word count for a HTML tag to be considered as a worthy block
      -    chunking_strategy= RegexChunking( patterns = ["\\n\\n"]), # Default is RegexChunking
      -    extraction_strategy= CosineStrategy(word_count_threshold=10, max_dist=0.2, linkage_method='ward', top_k=3) # Default is CosineStrategy
      -    # extraction_strategy= LLMExtractionStrategy(provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY')),
      -    bypass_cache=False,
      -    extract_blocks =True, # Whether to extract semantical blocks of text from the HTML
      -    css_selector = "", # Eg: "div.article-body"
      -    verbose=True,
      -    include_raw_html=True, # Whether to include the raw HTML content in the response
      -)
      -print(result.model_dump())
      -        
      -
    6. -
    -

    - For more information about how to run Crawl4AI as a local server, please refer to the - GitHub repository. -

    - -
    - -
    -

    📖 Parameters

    -
    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    ParameterDescriptionRequiredDefault Value
    urls - A list of URLs to crawl and extract data from. - Yes-
    include_raw_html - Whether to include the raw HTML content in the response. - Nofalse
    bypass_cache - Whether to force a fresh crawl even if the URL has been previously crawled. - Nofalse
    extract_blocks - Whether to extract semantical blocks of text from the HTML. - Notrue
    word_count_threshold - The minimum number of words a block must contain to be considered meaningful (minimum - value is 5). - No5
    extraction_strategy - The strategy to use for extracting content from the HTML (e.g., "CosineStrategy"). - NoCosineStrategy
    chunking_strategy - The strategy to use for chunking the text before processing (e.g., "RegexChunking"). - NoRegexChunking
    css_selector - The CSS selector to target specific parts of the HTML for extraction. - NoNone
    verboseWhether to enable verbose logging.Notrue
    -
    -
    - -
    -
    -

    Extraction Strategies

    -
    -
    -
    - -
    -
    -

    Chunking Strategies

    -
    -
    -
    - -
    -
    -

    🤔 Why building this?

    -

    - In recent times, we've witnessed a surge of startups emerging, riding the AI hype wave and charging - for services that should rightfully be accessible to everyone. 🌍💸 One such example is scraping and - crawling web pages and transforming them into a format suitable for Large Language Models (LLMs). - 🕸️🤖 We believe that building a business around this is not the right approach; instead, it should - definitely be open-source. 🆓🌟 So, if you possess the skills to build such tools and share our - philosophy, we invite you to join our "Robinhood" band and help set these products free for the - benefit of all. 🤝💪 -

    -
    -
    - -
    -
    -

    ⚙️ Installation

    -

    - To install and run Crawl4AI as a library or a local server, please refer to the 📚 - GitHub repository. -

    -
    -
    - - - - - - diff --git a/pages/index.html b/pages/index.html deleted file mode 100644 index 2947c34a..00000000 --- a/pages/index.html +++ /dev/null @@ -1,73 +0,0 @@ - - - - - - Crawl4AI - - - - - - - - - - - - - - - -
    - -
    -

    🔥🕷️ Crawl4AI: Web Data for your Thoughts

    -
    -
    - 📊 Total Website Processed - 2 -
    -
    - - {{ try_it | safe }} - -
    -
    -
    - - - -
    - {{installation | safe}} {{how_to_guide | safe}} - -
    -

    Chunking Strategies

    -

    Content for chunking strategies...

    -
    -
    -

    Extraction Strategies

    -

    Content for extraction strategies...

    -
    -
    -
    -
    -
    - - {{ footer | safe }} - - - diff --git a/pages/index_pooling.html b/pages/index_pooling.html deleted file mode 100644 index 02128f84..00000000 --- a/pages/index_pooling.html +++ /dev/null @@ -1,425 +0,0 @@ - - - - - - Crawl4AI - - - - - - - - - - - - -
    -
    -

    🔥🕷️ Crawl4AI: Open-source LLM Friendly Web scraper

    -
    -
    - -
    -
    -

    Try It Now

    -
    - - - -
    - - -
    - -
    -
    - -
    -
    - - - -
    -
    -
    - - -
    -
    -
    -
    - - - -
    -
    -
    -                                    
    -                                    
    -                                
    - - -
    -
    -
    -
    -
    - -
    -
    -

    🤔 Why building this?

    -

    - In recent times, we've seen numerous startups emerging, riding the AI hype wave and charging for - services that should rightfully be accessible to everyone. 🌍💸 One for example is to scrap and crawl - a web page, and transform it o a form suitable for LLM. We don't think one should build a business - out of this, but definilty should be opened source. So if you possess the skills to build such things - and you have such philosphy you should join our "Robinhood" band and help set - these products free. 🆓🤝 -

    -
    -
    - -
    -
    -

    ⚙️ Installation

    -

    - To install and run Crawl4AI locally or on your own service, the best way is to use Docker. 🐳 Follow - these steps: -

    -
      -
    1. - Clone the GitHub repository: 📥 - git clone https://github.com/unclecode/crawl4ai.git -
    2. -
    3. Navigate to the project directory: 📂 cd crawl4ai
    4. -
    5. - Build the Docker image: 🛠️ docker build -t crawl4ai . On Mac, follow: 🍎 - docker build --platform linux/amd64 -t crawl4ai . -
    6. -
    7. Run the Docker container: ▶️ docker run -p 8000:80 crawl4ai
    8. -
    -

    - For more detailed instructions and advanced configuration options, please refer to the 📚 - GitHub repository. -

    -
    -
    - - - - - - diff --git a/pages/partial/footer.html b/pages/partial/footer.html deleted file mode 100644 index 3ab189e1..00000000 --- a/pages/partial/footer.html +++ /dev/null @@ -1,36 +0,0 @@ -
    -
    -

    🤔 Why building this?

    -

    - In recent times, we've witnessed a surge of startups emerging, riding the AI hype wave and charging - for services that should rightfully be accessible to everyone. 🌍💸 One such example is scraping and - crawling web pages and transforming them into a format suitable for Large Language Models (LLMs). - 🕸️🤖 We believe that building a business around this is not the right approach; instead, it should - definitely be open-source. 🆓🌟 So, if you possess the skills to build such tools and share our - philosophy, we invite you to join our "Robinhood" band and help set these products free for the - benefit of all. 🤝💪 -

    -
    -
    - - \ No newline at end of file diff --git a/pages/partial/how_to_guide.html b/pages/partial/how_to_guide.html deleted file mode 100644 index 785915c1..00000000 --- a/pages/partial/how_to_guide.html +++ /dev/null @@ -1,174 +0,0 @@ -
    -

    How to Guide

    -
    - -
    - 🌟 - Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling - fun! -
    -
    - First Step: Create an instance of WebCrawler and call the - warmup() function. -
    -
    -
    crawler = WebCrawler()
    -crawler.warmup()
    -
    - - -
    - 🧠 Understanding 'bypass_cache' and 'include_raw_html' parameters: -
    -
    First crawl (caches the result):
    -
    -
    result = crawler.run(url="https://www.nbcnews.com/business")
    -
    -
    Second crawl (Force to crawl again):
    -
    -
    result = crawler.run(url="https://www.nbcnews.com/business", bypass_cache=True)
    -
    - ⚠️ Don't forget to set `bypass_cache` to True if you want to try different strategies for the same URL. Otherwise, the cached result will be returned. You can also set `always_by_pass_cache` in constructor to True to always bypass the cache. -
    -
    -
    Crawl result without raw HTML content:
    -
    -
    result = crawler.run(url="https://www.nbcnews.com/business", include_raw_html=False)
    -
    - - -
    - 📄 - The 'include_raw_html' parameter, when set to True, includes the raw HTML content - in the response. By default, it is set to True. -
    -
    Set always_by_pass_cache to True:
    -
    -
    crawler.always_by_pass_cache = True
    -
    - -
    - 📸 - Let's take a screenshot of the page! -
    -
    -
    result = crawler.run(
    -    url="https://www.nbcnews.com/business",
    -    screenshot=True
    -)
    -with open("screenshot.png", "wb") as f:
    -    f.write(base64.b64decode(result.screenshot))
    -
    - - - -
    - 🧩 Let's add a chunking strategy: RegexChunking! -
    -
    Using RegexChunking:
    -
    -
    result = crawler.run(
    -    url="https://www.nbcnews.com/business",
    -    chunking_strategy=RegexChunking(patterns=["\n\n"])
    -)
    -
    -
    Using NlpSentenceChunking:
    -
    -
    result = crawler.run(
    -    url="https://www.nbcnews.com/business",
    -    chunking_strategy=NlpSentenceChunking()
    -)
    -
    - - -
    - 🧠 Let's get smarter with an extraction strategy: CosineStrategy! -
    -
    Using CosineStrategy:
    -
    -
    result = crawler.run(
    -    url="https://www.nbcnews.com/business",
    -    extraction_strategy=CosineStrategy(word_count_threshold=10, max_dist=0.2, linkage_method="ward", top_k=3)
    -)
    -
    - - -
    - 🤖 - Time to bring in the big guns: LLMExtractionStrategy without instructions! -
    -
    Using LLMExtractionStrategy without instructions:
    -
    -
    result = crawler.run(
    -    url="https://www.nbcnews.com/business",
    -    extraction_strategy=LLMExtractionStrategy(provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY'))
    -)
    -
    - - -
    - 📜 - Let's make it even more interesting: LLMExtractionStrategy with - instructions! -
    -
    Using LLMExtractionStrategy with instructions:
    -
    -
    result = crawler.run(
    -    url="https://www.nbcnews.com/business",
    -    extraction_strategy=LLMExtractionStrategy(
    -    provider="openai/gpt-4o",
    -    api_token=os.getenv('OPENAI_API_KEY'),
    -    instruction="I am interested in only financial news"
    -)
    -)
    -
    - - -
    - 🎯 - Targeted extraction: Let's use a CSS selector to extract only H2 tags! -
    -
    Using CSS selector to extract H2 tags:
    -
    -
    result = crawler.run(
    -    url="https://www.nbcnews.com/business",
    -    css_selector="h2"
    -)
    -
    - - -
    - 🖱️ - Let's get interactive: Passing JavaScript code to click 'Load More' button! -
    -
    Using JavaScript to click 'Load More' button:
    -
    -
    js_code = ["""
    -const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
    -loadMoreButton && loadMoreButton.click();
    -"""]
    -crawler = WebCrawler(verbos=crawler_strategy, always_by_pass_cache=True)
    -result = crawler.run(url="https://www.nbcnews.com/business", js = js_code)
    -
    Remember that you can pass multiple JavaScript code snippets in the list. They all will be executed in the order they are passed.
    -
    - - -
    - 🎉 - Congratulations! You've made it through the Crawl4ai Quickstart Guide! Now go forth - and crawl the web like a pro! 🕸️ -
    -
    -
    \ No newline at end of file diff --git a/pages/partial/installation.html b/pages/partial/installation.html deleted file mode 100644 index 6a6561cd..00000000 --- a/pages/partial/installation.html +++ /dev/null @@ -1,65 +0,0 @@ -
    -

    Installation 💻

    -

    - There are three ways to use Crawl4AI: -

      -
    1. - As a library -
    2. -
    3. - As a local server (Docker) -
    4. -
    5. - As a Google Colab notebook. Open In Colab -
    6. -

      - - -

      To install Crawl4AI as a library, follow these steps:

      - -
        -
      1. - Install the package from GitHub: -
        virtualenv venv
        -source venv/bin/activate
        -pip install "crawl4ai[all] @ git+https://github.com/unclecode/crawl4ai.git"
        -            
        -
      2. -
      3. - Run the following command to load the required models. This is optional, but it will boost the performance and speed of the crawler. You need to do this only once. -
        crawl4ai-download-models
        -
      4. -
      5. - Alternatively, you can clone the repository and install the package locally: -
        virtualenv venv
        -source venv/bin/activate
        -git clone https://github.com/unclecode/crawl4ai.git
        -cd crawl4ai
        -pip install -e .[all]
        -
        -
      6. -
      7. - Use docker to run the local server: -
        docker build -t crawl4ai . 
        -# docker build --platform linux/amd64 -t crawl4ai . For Mac users
        -docker run -d -p 8000:80 crawl4ai
        -
      8. -
      -

      - For more information about how to run Crawl4AI as a local server, please refer to the - GitHub repository. -

      -
    \ No newline at end of file diff --git a/pages/partial/try_it.html b/pages/partial/try_it.html deleted file mode 100644 index e3033eec..00000000 --- a/pages/partial/try_it.html +++ /dev/null @@ -1,217 +0,0 @@ -
    -
    -

    Try It Now

    -
    -
    -
    - - -
    -
    -
    - - -
    -
    - - -
    -
    -
    -
    - - -
    -
    - - -
    -
    - -
    - - - -
    -
    -
    - - -
    -
    - - -
    - - -
    -
    - - -
    -
    - - - - - -
    -
    -
    - - - - -
    -
    - -
    -
    - - - - -
    -
    -
    -                        
    -                        
    -                    
    - - - -
    -
    -
    -
    -
    diff --git a/pages/tmp.html b/pages/tmp.html deleted file mode 100644 index 7c924676..00000000 --- a/pages/tmp.html +++ /dev/null @@ -1,434 +0,0 @@ -
    -
    -

    Installation 💻

    -

    There are three ways to use Crawl4AI:

    -
      -
    1. As a library
    2. -
    3. As a local server (Docker)
    4. -
    5. - As a Google Colab notebook. - Open In Colab -
    6. -

      - -

      To install Crawl4AI as a library, follow these steps:

      - -
        -
      1. - Install the package from GitHub: -
        pip install git+https://github.com/unclecode/crawl4ai.git
        -
      2. -
      3. - Alternatively, you can clone the repository and install the package locally: -
        virtualenv venv
        -source venv/bin/activate
        -git clone https://github.com/unclecode/crawl4ai.git
        -cd crawl4ai
        -pip install -e .
        -
        -
      4. -
      5. - Use docker to run the local server: -
        docker build -t crawl4ai . 
        -# docker build --platform linux/amd64 -t crawl4ai . For Mac users
        -docker run -d -p 8000:80 crawl4ai
        -
      6. -
      -

      - For more information about how to run Crawl4AI as a local server, please refer to the - GitHub repository. -

      -
    -
    -
    -

    How to Guide

    -
    - -
    - 🌟 - Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! -
    -
    - First Step: Create an instance of WebCrawler and call the - warmup() function. -
    -
    -
    crawler = WebCrawler()
    -crawler.warmup()
    -
    - - -
    - 🧠 Understanding 'bypass_cache' and 'include_raw_html' parameters: -
    -
    First crawl (caches the result):
    -
    -
    result = crawler.run(url="https://www.nbcnews.com/business")
    -
    -
    Second crawl (Force to crawl again):
    -
    -
    result = crawler.run(url="https://www.nbcnews.com/business", bypass_cache=True)
    -
    - ⚠️ Don't forget to set `bypass_cache` to True if you want to try different strategies - for the same URL. Otherwise, the cached result will be returned. You can also set - `always_by_pass_cache` in constructor to True to always bypass the cache. -
    -
    -
    Crawl result without raw HTML content:
    -
    -
    result = crawler.run(url="https://www.nbcnews.com/business", include_raw_html=False)
    -
    - - -
    - 📄 - The 'include_raw_html' parameter, when set to True, includes the raw HTML content in the response. - By default, it is set to True. -
    -
    Set always_by_pass_cache to True:
    -
    -
    crawler.always_by_pass_cache = True
    -
    - - -
    - 🧩 Let's add a chunking strategy: RegexChunking! -
    -
    Using RegexChunking:
    -
    -
    result = crawler.run(
    -url="https://www.nbcnews.com/business",
    -chunking_strategy=RegexChunking(patterns=["\n\n"])
    -)
    -
    -
    Using NlpSentenceChunking:
    -
    -
    result = crawler.run(
    -url="https://www.nbcnews.com/business",
    -chunking_strategy=NlpSentenceChunking()
    -)
    -
    - - -
    - 🧠 Let's get smarter with an extraction strategy: CosineStrategy! -
    -
    Using CosineStrategy:
    -
    -
    result = crawler.run(
    -url="https://www.nbcnews.com/business",
    -extraction_strategy=CosineStrategy(word_count_threshold=20, max_dist=0.2, linkage_method="ward", top_k=3)
    -)
    -
    - - -
    - 🤖 - Time to bring in the big guns: LLMExtractionStrategy without instructions! -
    -
    Using LLMExtractionStrategy without instructions:
    -
    -
    result = crawler.run(
    -url="https://www.nbcnews.com/business",
    -extraction_strategy=LLMExtractionStrategy(provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY'))
    -)
    -
    - - -
    - 📜 - Let's make it even more interesting: LLMExtractionStrategy with instructions! -
    -
    Using LLMExtractionStrategy with instructions:
    -
    -
    result = crawler.run(
    -url="https://www.nbcnews.com/business",
    -extraction_strategy=LLMExtractionStrategy(
    -provider="openai/gpt-4o",
    -api_token=os.getenv('OPENAI_API_KEY'),
    -instruction="I am interested in only financial news"
    -)
    -)
    -
    - - -
    - 🎯 - Targeted extraction: Let's use a CSS selector to extract only H2 tags! -
    -
    Using CSS selector to extract H2 tags:
    -
    -
    result = crawler.run(
    -url="https://www.nbcnews.com/business",
    -css_selector="h2"
    -)
    -
    - - -
    - 🖱️ - Let's get interactive: Passing JavaScript code to click 'Load More' button! -
    -
    Using JavaScript to click 'Load More' button:
    -
    -
    js_code = """
    -const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
    -loadMoreButton && loadMoreButton.click();
    -"""
    -crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
    -crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
    -result = crawler.run(url="https://www.nbcnews.com/business")
    -
    - - -
    - 🎉 - Congratulations! You've made it through the Crawl4ai Quickstart Guide! Now go forth and crawl the - web like a pro! 🕸️ -
    -
    -
    - -
    -
    -
    -

    RegexChunking

    -

    - RegexChunking is a text chunking strategy that splits a given text into smaller parts - using regular expressions. This is useful for preparing large texts for processing by language - models, ensuring they are divided into manageable segments. -

    -

    Constructor Parameters:

    -
      -
    • - patterns (list, optional): A list of regular expression patterns used to split the - text. Default is to split by double newlines (['\n\n']). -
    • -
    -

    Example usage:

    -
    chunker = RegexChunking(patterns=[r'\n\n', r'\. '])
    -chunks = chunker.chunk("This is a sample text. It will be split into chunks.")
    -
    -
    -
    -
    -
    -

    NlpSentenceChunking

    -

    - NlpSentenceChunking uses a natural language processing model to chunk a given text into - sentences. This approach leverages SpaCy to accurately split text based on sentence boundaries. -

    -

    Constructor Parameters:

    -
      -
    • - None. -
    • -
    -

    Example usage:

    -
    chunker = NlpSentenceChunking()
    -chunks = chunker.chunk("This is a sample text. It will be split into sentences.")
    -
    -
    -
    -
    -
    -

    TopicSegmentationChunking

    -

    - TopicSegmentationChunking uses the TextTiling algorithm to segment a given text into - topic-based chunks. This method identifies thematic boundaries in the text. -

    -

    Constructor Parameters:

    -
      -
    • - num_keywords (int, optional): The number of keywords to extract for each topic - segment. Default is 3. -
    • -
    -

    Example usage:

    -
    chunker = TopicSegmentationChunking(num_keywords=3)
    -chunks = chunker.chunk("This is a sample text. It will be split into topic-based segments.")
    -
    -
    -
    -
    -
    -

    FixedLengthWordChunking

    -

    - FixedLengthWordChunking splits a given text into chunks of fixed length, based on the - number of words. -

    -

    Constructor Parameters:

    -
      -
    • - chunk_size (int, optional): The number of words in each chunk. Default is - 100. -
    • -
    -

    Example usage:

    -
    chunker = FixedLengthWordChunking(chunk_size=100)
    -chunks = chunker.chunk("This is a sample text. It will be split into fixed-length word chunks.")
    -
    -
    -
    -
    -
    -

    SlidingWindowChunking

    -

    - SlidingWindowChunking uses a sliding window approach to chunk a given text. Each chunk - has a fixed length, and the window slides by a specified step size. -

    -

    Constructor Parameters:

    -
      -
    • - window_size (int, optional): The number of words in each chunk. Default is - 100. -
    • -
    • - step (int, optional): The number of words to slide the window. Default is - 50. -
    • -
    -

    Example usage:

    -
    chunker = SlidingWindowChunking(window_size=100, step=50)
    -chunks = chunker.chunk("This is a sample text. It will be split using a sliding window approach.")
    -
    -
    -
    -
    -
    -
    -
    -

    NoExtractionStrategy

    -

    - NoExtractionStrategy is a basic extraction strategy that returns the entire HTML - content without any modification. It is useful for cases where no specific extraction is required. - Only clean html, and amrkdown. -

    -

    Constructor Parameters:

    -

    None.

    -

    Example usage:

    -
    extractor = NoExtractionStrategy()
    -extracted_content = extractor.extract(url, html)
    -
    -
    -
    -
    -
    -

    LLMExtractionStrategy

    -

    - LLMExtractionStrategy uses a Language Model (LLM) to extract meaningful blocks or - chunks from the given HTML content. This strategy leverages an external provider for language model - completions. -

    -

    Constructor Parameters:

    -
      -
    • - provider (str, optional): The provider to use for the language model completions. - Default is DEFAULT_PROVIDER (e.g., openai/gpt-4). -
    • -
    • - api_token (str, optional): The API token for the provider. If not provided, it will - try to load from the environment variable OPENAI_API_KEY. -
    • -
    • - instruction (str, optional): An instruction to guide the LLM on how to perform the - extraction. This allows users to specify the type of data they are interested in or set the tone - of the response. Default is None. -
    • -
    -

    Example usage:

    -
    extractor = LLMExtractionStrategy(provider='openai', api_token='your_api_token', instruction='Extract only news about AI.')
    -extracted_content = extractor.extract(url, html)
    -
    -

    - By providing clear instructions, users can tailor the extraction process to their specific needs, - enhancing the relevance and utility of the extracted content. -

    -
    -
    -
    -
    -

    CosineStrategy

    -

    - CosineStrategy uses hierarchical clustering based on cosine similarity to extract - clusters of text from the given HTML content. This strategy is suitable for identifying related - content sections. -

    -

    Constructor Parameters:

    -
      -
    • - semantic_filter (str, optional): A string containing keywords for filtering relevant - documents before clustering. If provided, documents are filtered based on their cosine - similarity to the keyword filter embedding. Default is None. -
    • -
    • - word_count_threshold (int, optional): Minimum number of words per cluster. Default - is 20. -
    • -
    • - max_dist (float, optional): The maximum cophenetic distance on the dendrogram to - form clusters. Default is 0.2. -
    • -
    • - linkage_method (str, optional): The linkage method for hierarchical clustering. - Default is 'ward'. -
    • -
    • - top_k (int, optional): Number of top categories to extract. Default is - 3. -
    • -
    • - model_name (str, optional): The model name for embedding generation. Default is - 'BAAI/bge-small-en-v1.5'. -
    • -
    -

    Example usage:

    -
    extractor = CosineStrategy(semantic_filter='artificial intelligence', word_count_threshold=10, max_dist=0.2, linkage_method='ward', top_k=3, model_name='BAAI/bge-small-en-v1.5')
    -extracted_content = extractor.extract(url, html)
    -
    -

    Cosine Similarity Filtering

    -

    - When a semantic_filter is provided, the CosineStrategy applies an - embedding-based filtering process to select relevant documents before performing hierarchical - clustering. -

    -
    -
    -
    -
    -

    TopicExtractionStrategy

    -

    - TopicExtractionStrategy uses the TextTiling algorithm to segment the HTML content into - topics and extracts keywords for each segment. This strategy is useful for identifying and - summarizing thematic content. -

    -

    Constructor Parameters:

    -
      -
    • - num_keywords (int, optional): Number of keywords to represent each topic segment. - Default is 3. -
    • -
    -

    Example usage:

    -
    extractor = TopicExtractionStrategy(num_keywords=3)
    -extracted_content = extractor.extract(url, html)
    -
    -
    -
    -
    -
    From d7c5b900b8d5d965d56417ac94681e7a11bbb1ee Mon Sep 17 00:00:00 2001 From: unclecode Date: Sun, 24 Nov 2024 19:35:53 +0800 Subject: [PATCH 067/115] feat: add support for arm64 platform in Docker commands and update INSTALL_TYPE variable in docker-compose --- README.md | 9 +++++++++ docker-compose.yml | 4 ++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index fa88a507..6c5e256e 100644 --- a/README.md +++ b/README.md @@ -142,6 +142,9 @@ docker pull unclecode/crawl4ai:gpu # GPU-enabled version # Run the container docker run -p 11235:11235 unclecode/crawl4ai:basic # Replace 'basic' with your chosen version +# In case you want to set platform to arm64 +docker run --platform linux/arm64 -p 11235:11235 unclecode/crawl4ai:basic + # In case to allocate more shared memory for the container docker run --shm-size=2gb -p 11235:11235 unclecode/crawl4ai:basic ``` @@ -158,6 +161,12 @@ docker build -t crawl4ai:local \ --build-arg INSTALL_TYPE=basic \ # Options: basic, all . +# In case you want to set platform to arm64 +docker build -t crawl4ai:local \ + --build-arg INSTALL_TYPE=basic \ # Options: basic, all + --platform linux/arm64 \ + . + # Run your local build docker run -p 11235:11235 crawl4ai:local ``` diff --git a/docker-compose.yml b/docker-compose.yml index 1097ef11..b93beda9 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -4,8 +4,8 @@ services: context: . 
dockerfile: Dockerfile args: - PYTHON_VERSION: 3.10 - INSTALL_TYPE: all + PYTHON_VERSION: "3.10" + INSTALL_TYPE: ${INSTALL_TYPE:-basic} ENABLE_GPU: false profiles: ["local"] ports: From de43505ae4177ddf671c8b765f2f55c28a740e47 Mon Sep 17 00:00:00 2001 From: unclecode Date: Sun, 24 Nov 2024 19:36:30 +0800 Subject: [PATCH 068/115] feat: update version to 0.3.742 --- crawl4ai/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index 05bfd336..f06970ce 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,2 +1,2 @@ # crawl4ai/_version.py -__version__ = "0.3.741" \ No newline at end of file +__version__ = "0.3.742" \ No newline at end of file From b09a86c0c1bc1036ff4954da991dfbccf65534cd Mon Sep 17 00:00:00 2001 From: unclecode Date: Sun, 24 Nov 2024 19:40:10 +0800 Subject: [PATCH 069/115] chore: remove deprecated Docker Compose configurations for crawl4ai service --- docker-compose.hub.yml | 27 --------------------------- docker-compose.local.yml | 33 --------------------------------- 2 files changed, 60 deletions(-) delete mode 100644 docker-compose.hub.yml delete mode 100644 docker-compose.local.yml diff --git a/docker-compose.hub.yml b/docker-compose.hub.yml deleted file mode 100644 index 9bcfa982..00000000 --- a/docker-compose.hub.yml +++ /dev/null @@ -1,27 +0,0 @@ -services: - crawl4ai: - image: unclecode/crawl4ai:basic # Pull image from Docker Hub - ports: - - "11235:11235" # FastAPI server - - "8000:8000" # Alternative port - - "9222:9222" # Browser debugging - - "8080:8080" # Additional port - environment: - - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} # Optional API token - - OPENAI_API_KEY=${OPENAI_API_KEY:-} # Optional OpenAI API key - - CLAUDE_API_KEY=${CLAUDE_API_KEY:-} # Optional Claude API key - volumes: - - /dev/shm:/dev/shm # Shared memory for browser operations - deploy: - resources: - limits: - memory: 4G - reservations: - memory: 1G - restart: 
unless-stopped - healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:11235/health"] - interval: 30s - timeout: 10s - retries: 3 - start_period: 40s diff --git a/docker-compose.local.yml b/docker-compose.local.yml deleted file mode 100644 index 7dc41b47..00000000 --- a/docker-compose.local.yml +++ /dev/null @@ -1,33 +0,0 @@ -services: - crawl4ai: - build: - context: . - dockerfile: Dockerfile - args: - PYTHON_VERSION: 3.10 - INSTALL_TYPE: all - ENABLE_GPU: false - ports: - - "11235:11235" # FastAPI server - - "8000:8000" # Alternative port - - "9222:9222" # Browser debugging - - "8080:8080" # Additional port - environment: - - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} # Optional API token - - OPENAI_API_KEY=${OPENAI_API_KEY:-} # Optional OpenAI API key - - CLAUDE_API_KEY=${CLAUDE_API_KEY:-} # Optional Claude API key - volumes: - - /dev/shm:/dev/shm # Shared memory for browser operations - deploy: - resources: - limits: - memory: 4G - reservations: - memory: 1G - restart: unless-stopped - healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:11235/health"] - interval: 30s - timeout: 10s - retries: 3 - start_period: 40s \ No newline at end of file From 195c0ccf8aa5e0462b97bc8a7f5cff608b69b53a Mon Sep 17 00:00:00 2001 From: unclecode Date: Sun, 24 Nov 2024 19:40:27 +0800 Subject: [PATCH 070/115] chore: remove deprecated Docker Compose configurations for crawl4ai service --- docker-compose.hub.yml | 27 --------------------------- docker-compose.local.yml | 33 --------------------------------- 2 files changed, 60 deletions(-) delete mode 100644 docker-compose.hub.yml delete mode 100644 docker-compose.local.yml diff --git a/docker-compose.hub.yml b/docker-compose.hub.yml deleted file mode 100644 index 9bcfa982..00000000 --- a/docker-compose.hub.yml +++ /dev/null @@ -1,27 +0,0 @@ -services: - crawl4ai: - image: unclecode/crawl4ai:basic # Pull image from Docker Hub - ports: - - "11235:11235" # FastAPI server - - "8000:8000" # Alternative port - - 
"9222:9222" # Browser debugging - - "8080:8080" # Additional port - environment: - - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} # Optional API token - - OPENAI_API_KEY=${OPENAI_API_KEY:-} # Optional OpenAI API key - - CLAUDE_API_KEY=${CLAUDE_API_KEY:-} # Optional Claude API key - volumes: - - /dev/shm:/dev/shm # Shared memory for browser operations - deploy: - resources: - limits: - memory: 4G - reservations: - memory: 1G - restart: unless-stopped - healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:11235/health"] - interval: 30s - timeout: 10s - retries: 3 - start_period: 40s diff --git a/docker-compose.local.yml b/docker-compose.local.yml deleted file mode 100644 index 7dc41b47..00000000 --- a/docker-compose.local.yml +++ /dev/null @@ -1,33 +0,0 @@ -services: - crawl4ai: - build: - context: . - dockerfile: Dockerfile - args: - PYTHON_VERSION: 3.10 - INSTALL_TYPE: all - ENABLE_GPU: false - ports: - - "11235:11235" # FastAPI server - - "8000:8000" # Alternative port - - "9222:9222" # Browser debugging - - "8080:8080" # Additional port - environment: - - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} # Optional API token - - OPENAI_API_KEY=${OPENAI_API_KEY:-} # Optional OpenAI API key - - CLAUDE_API_KEY=${CLAUDE_API_KEY:-} # Optional Claude API key - volumes: - - /dev/shm:/dev/shm # Shared memory for browser operations - deploy: - resources: - limits: - memory: 4G - reservations: - memory: 1G - restart: unless-stopped - healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:11235/health"] - interval: 30s - timeout: 10s - retries: 3 - start_period: 40s \ No newline at end of file From c6a022132b9fff4db14586a55c95f346ac3da5f7 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Wed, 27 Nov 2024 14:55:56 +0800 Subject: [PATCH 071/115] docs: update CONTRIBUTORS.md to acknowledge aadityakanjolia4 for fixing 'CustomHTML2Text' bug --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 0b5dcede..81e916cb 100644 
--- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -10,6 +10,7 @@ We would like to thank the following people for their contributions to Crawl4AI: ## Community Contributors +- [aadityakanjolia4](https://github.com/aadityakanjolia4) - Fixing 'CustomHTML2Text' is not defined bug in the code. - [FractalMind](https://github.com/FractalMind) - Created the first official Docker Hub image and fixed Dockerfile errors - [ketonkss4](https://github.com/ketonkss4) - Identified Selenium's new capabilities, helping reduce dependencies - [jonymusky](https://github.com/jonymusky) - Javascript execution documentation, and wait_for From 73661f7d1fd37111e34e4dc9ec10f87d5a5f3afe Mon Sep 17 00:00:00 2001 From: zhounan Date: Wed, 27 Nov 2024 15:04:20 +0800 Subject: [PATCH 072/115] docs: enhance development installation instructions (#286) Thanks for your contribution. I'm merging your changes and I'll add your name to our contributor list. Thank you so much. --- README.md | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 6c5e256e..5ba33dea 100644 --- a/README.md +++ b/README.md @@ -110,7 +110,15 @@ For contributors who plan to modify the source code: ```bash git clone https://github.com/unclecode/crawl4ai.git cd crawl4ai -pip install -e . +pip install -e . # Basic installation in editable mode +``` +Install optional features: +```bash +pip install -e ".[torch]" # With PyTorch features +pip install -e ".[transformer]" # With Transformer features +pip install -e ".[cosine]" # With cosine similarity features +pip install -e ".[sync]" # With synchronous crawling (Selenium) +pip install -e ".[all]" # Install all optional features ``` ## One-Click Deployment 🚀 From f998e9e94906302a4ee32cd5e581f4fa7bd22021 Mon Sep 17 00:00:00 2001 From: Hamza Farhan Date: Wed, 27 Nov 2024 16:20:54 +0500 Subject: [PATCH 073/115] Fix: handled the cases where markdown_with_citations, references_markdown, and filtered_html might not be defined. 
(#293) Thanks, dear Farhan, for the changes you made in the code. I accepted and merged them into the main branch. Also, I will add your name to our contributor list. Thank you so much. --- crawl4ai/markdown_generation_strategy.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/crawl4ai/markdown_generation_strategy.py b/crawl4ai/markdown_generation_strategy.py index 7922c413..249bc1ce 100644 --- a/crawl4ai/markdown_generation_strategy.py +++ b/crawl4ai/markdown_generation_strategy.py @@ -84,6 +84,8 @@ class DefaultMarkdownGenerationStrategy(MarkdownGenerationStrategy): raw_markdown = raw_markdown.replace(' ```', '```') # Convert links to citations + markdown_with_citations: str = "" + references_markdown: str = "" if citations: markdown_with_citations, references_markdown = self.convert_links_to_citations( raw_markdown, base_url @@ -91,9 +93,9 @@ class DefaultMarkdownGenerationStrategy(MarkdownGenerationStrategy): # Generate fit markdown if content filter is provided fit_markdown: Optional[str] = None + filtered_html: Optional[str] = None if content_filter: - filtered_html = content_filter.filter_content(cleaned_html) - filtered_html = '\n'.join('
    {}
    '.format(s) for s in filtered_html) + filtered_html = '\n'.join('
    {}
    '.format(s) for s in content_filter.filter_content(cleaned_html)) fit_markdown = h.handle(filtered_html) return MarkdownGenerationResult( @@ -101,7 +103,7 @@ class DefaultMarkdownGenerationStrategy(MarkdownGenerationStrategy): markdown_with_citations=markdown_with_citations, references_markdown=references_markdown, fit_markdown=fit_markdown, - fit_html=filtered_html + fit_html=filtered_html, ) def fast_urljoin(base: str, url: str) -> str: From 24723b2f100ed25747b1b84a833f82e17340b457 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 28 Nov 2024 12:45:05 +0800 Subject: [PATCH 074/115] Enhance features and documentation - Updated version to 0.3.743 - Improved ManagedBrowser configuration with dynamic host/port - Implemented fast HTML formatting in web crawler - Enhanced markdown generation with a new generator class - Improved sanitization and utility functions - Added contributor details and pull request acknowledgments - Updated documentation for clearer usage scenarios - Adjusted tests to reflect class name changes --- CONTRIBUTORS.md | 8 +++ crawl4ai/__version__.py | 2 +- crawl4ai/async_crawler_strategy.py | 24 +++++---- crawl4ai/async_webcrawler.py | 12 +++-- crawl4ai/content_scraping_strategy.py | 19 ++++--- crawl4ai/markdown_generation_strategy.py | 14 ++++-- crawl4ai/utils.py | 64 +++++++++++++++++++++--- docs/md_v2/advanced/hooks-auth.md | 8 ++- tests/async/test_markdown_genertor.py | 14 +++--- 9 files changed, 123 insertions(+), 42 deletions(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 0b5dcede..deb46a9c 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -10,11 +10,19 @@ We would like to thank the following people for their contributions to Crawl4AI: ## Community Contributors +- [aadityakanjolia4](https://github.com/aadityakanjolia4) - Fix for `CustomHTML2Text` is not defined. 
- [FractalMind](https://github.com/FractalMind) - Created the first official Docker Hub image and fixed Dockerfile errors - [ketonkss4](https://github.com/ketonkss4) - Identified Selenium's new capabilities, helping reduce dependencies - [jonymusky](https://github.com/jonymusky) - Javascript execution documentation, and wait_for - [datehoer](https://github.com/datehoer) - Add browser prxy support +## Pull Requests + +- [nelzomal](https://github.com/nelzomal) - Enhance development installation instructions [#286](https://github.com/unclecode/crawl4ai/pull/286) +- [HamzaFarhan](https://github.com/HamzaFarhan) - Handled the cases where markdown_with_citations, references_markdown, and filtered_html might not be defined [#293](https://github.com/unclecode/crawl4ai/pull/293) +- [NanmiCoder](https://github.com/NanmiCoder) - fix: crawler strategy exception handling and fixes [#271](https://github.com/unclecode/crawl4ai/pull/271) + + ## Other Contributors - [Gokhan](https://github.com/gkhngyk) diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index 05bfd336..37e3c08a 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,2 +1,2 @@ # crawl4ai/_version.py -__version__ = "0.3.741" \ No newline at end of file +__version__ = "0.3.743" \ No newline at end of file diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 3f332eb0..882f9a50 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -35,13 +35,14 @@ stealth_config = StealthConfig( class ManagedBrowser: - def __init__(self, browser_type: str = "chromium", user_data_dir: Optional[str] = None, headless: bool = False, logger = None): + def __init__(self, browser_type: str = "chromium", user_data_dir: Optional[str] = None, headless: bool = False, logger = None, host: str = "localhost", debugging_port: int = 9222): self.browser_type = browser_type self.user_data_dir = user_data_dir self.headless = headless 
self.browser_process = None self.temp_dir = None - self.debugging_port = 9222 + self.debugging_port = debugging_port + self.host = host self.logger = logger self.shutting_down = False @@ -70,7 +71,7 @@ class ManagedBrowser: # Monitor browser process output for errors asyncio.create_task(self._monitor_browser_process()) await asyncio.sleep(2) # Give browser time to start - return f"http://localhost:{self.debugging_port}" + return f"http://{self.host}:{self.debugging_port}" except Exception as e: await self.cleanup() raise Exception(f"Failed to start browser: {e}") @@ -416,13 +417,13 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): else: raise ValueError(f"Invalid hook type: {hook_type}") - async def execute_hook(self, hook_type: str, *args): + async def execute_hook(self, hook_type: str, *args, **kwargs): hook = self.hooks.get(hook_type) if hook: if asyncio.iscoroutinefunction(hook): - return await hook(*args) + return await hook(*args, **kwargs) else: - return hook(*args) + return hook(*args, **kwargs) return args[0] if args else None def update_user_agent(self, user_agent: str): @@ -642,6 +643,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): session_id = kwargs.get("session_id") # Handle page creation differently for managed browser + context = None if self.use_managed_browser: if session_id: # Reuse existing session if available @@ -760,7 +762,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): return response if not kwargs.get("js_only", False): - await self.execute_hook('before_goto', page) + await self.execute_hook('before_goto', page, context = context) response = await page.goto( @@ -773,7 +775,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # response = await page.goto("about:blank") # await page.evaluate(f"window.location.href = '{url}'") - await self.execute_hook('after_goto', page) + await self.execute_hook('after_goto', page, context = context) # Get status code and headers status_code = 
response.status @@ -838,7 +840,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # await page.wait_for_timeout(100) # Check for on execution event - await self.execute_hook('on_execution_started', page) + await self.execute_hook('on_execution_started', page, context = context) if kwargs.get("simulate_user", False) or kwargs.get("magic", False): # Simulate user interactions @@ -924,7 +926,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if kwargs.get("process_iframes", False): page = await self.process_iframes(page) - await self.execute_hook('before_retrieve_html', page) + await self.execute_hook('before_retrieve_html', page, context = context) # Check if delay_before_return_html is set then wait for that time delay_before_return_html = kwargs.get("delay_before_return_html") if delay_before_return_html: @@ -935,7 +937,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): await self.remove_overlay_elements(page) html = await page.content() - await self.execute_hook('before_return_html', page, html) + await self.execute_hook('before_return_html', page, html, context = context) # Check if kwargs has screenshot=True then take screenshot screenshot_data = None diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index b8be6f35..5a46fe39 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -25,7 +25,8 @@ from .config import ( from .utils import ( sanitize_input_encode, InvalidCSSSelectorError, - format_html + format_html, + fast_format_html ) from urllib.parse import urlparse import random @@ -534,16 +535,17 @@ class AsyncWebCrawler: "timing": time.perf_counter() - t1 } ) - - - screenshot = None if not screenshot else screenshot + + if kwargs.get("prettiify", False): + cleaned_html = fast_format_html(cleaned_html) + return CrawlResult( url=url, html=html, - cleaned_html=format_html(cleaned_html), + cleaned_html=cleaned_html, markdown_v2=markdown_v2, markdown=markdown, 
fit_markdown=fit_markdown, diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index ea6a2ef8..ec6c3361 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -10,7 +10,7 @@ from urllib.parse import urljoin from requests.exceptions import InvalidSchema # from .content_cleaning_strategy import ContentCleaningStrategy from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter#, HeuristicContentFilter -from .markdown_generation_strategy import MarkdownGenerationStrategy, DefaultMarkdownGenerationStrategy +from .markdown_generation_strategy import MarkdownGenerationStrategy, DefaultMarkdownGenerator from .models import MarkdownGenerationResult from .utils import ( sanitize_input_encode, @@ -105,21 +105,28 @@ class WebScrapingStrategy(ContentScrapingStrategy): Returns: Dict containing markdown content in various formats """ - markdown_generator: Optional[MarkdownGenerationStrategy] = kwargs.get('markdown_generator', DefaultMarkdownGenerationStrategy()) + markdown_generator: Optional[MarkdownGenerationStrategy] = kwargs.get('markdown_generator', DefaultMarkdownGenerator()) if markdown_generator: try: + if kwargs.get('fit_markdown', False) and not markdown_generator.content_filter: + markdown_generator.content_filter = BM25ContentFilter( + user_query=kwargs.get('fit_markdown_user_query', None), + bm25_threshold=kwargs.get('fit_markdown_bm25_threshold', 1.0) + ) + markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown( cleaned_html=cleaned_html, base_url=url, - html2text_options=kwargs.get('html2text', {}), - content_filter=kwargs.get('content_filter', None) + html2text_options=kwargs.get('html2text', {}) ) + help_message = """""" + return { 'markdown': markdown_result.raw_markdown, - 'fit_markdown': markdown_result.fit_markdown or "Set flag 'fit_markdown' to True to get cleaned HTML content.", - 'fit_html': markdown_result.fit_html or "Set flag 
'fit_markdown' to True to get cleaned HTML content.", + 'fit_markdown': markdown_result.fit_markdown, + 'fit_html': markdown_result.fit_html, 'markdown_v2': markdown_result } except Exception as e: diff --git a/crawl4ai/markdown_generation_strategy.py b/crawl4ai/markdown_generation_strategy.py index 7922c413..b1e43f9d 100644 --- a/crawl4ai/markdown_generation_strategy.py +++ b/crawl4ai/markdown_generation_strategy.py @@ -11,6 +11,8 @@ LINK_PATTERN = re.compile(r'!?\[([^\]]+)\]\(([^)]+?)(?:\s+"([^"]*)")?\)') class MarkdownGenerationStrategy(ABC): """Abstract base class for markdown generation strategies.""" + def __init__(self, content_filter: Optional[RelevantContentFilter] = None): + self.content_filter = content_filter @abstractmethod def generate_markdown(self, @@ -23,8 +25,10 @@ class MarkdownGenerationStrategy(ABC): """Generate markdown from cleaned HTML.""" pass -class DefaultMarkdownGenerationStrategy(MarkdownGenerationStrategy): +class DefaultMarkdownGenerator(MarkdownGenerationStrategy): """Default implementation of markdown generation strategy.""" + def __init__(self, content_filter: Optional[RelevantContentFilter] = None): + super().__init__(content_filter) def convert_links_to_citations(self, markdown: str, base_url: str = "") -> Tuple[str, str]: link_map = {} @@ -84,14 +88,18 @@ class DefaultMarkdownGenerationStrategy(MarkdownGenerationStrategy): raw_markdown = raw_markdown.replace(' ```', '```') # Convert links to citations + markdown_with_citations: str = "" + references_markdown: str = "" if citations: markdown_with_citations, references_markdown = self.convert_links_to_citations( raw_markdown, base_url ) # Generate fit markdown if content filter is provided - fit_markdown: Optional[str] = None - if content_filter: + fit_markdown: Optional[str] = "" + filtered_html: Optional[str] = "" + if content_filter or self.content_filter: + content_filter = content_filter or self.content_filter filtered_html = content_filter.filter_content(cleaned_html) 
filtered_html = '\n'.join('
    {}
    '.format(s) for s in filtered_html) fit_markdown = h.handle(filtered_html) diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index b07562df..aaf27e91 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -233,12 +233,17 @@ def sanitize_html(html): def sanitize_input_encode(text: str) -> str: """Sanitize input to handle potential encoding issues.""" try: - # Attempt to encode and decode as UTF-8 to handle potential encoding issues - return text.encode('utf-8', errors='ignore').decode('utf-8') - except UnicodeEncodeError as e: - print(f"Warning: Encoding issue detected. Some characters may be lost. Error: {e}") - # Fall back to ASCII if UTF-8 fails - return text.encode('ascii', errors='ignore').decode('ascii') + try: + if not text: + return '' + # Attempt to encode and decode as UTF-8 to handle potential encoding issues + return text.encode('utf-8', errors='ignore').decode('utf-8') + except UnicodeEncodeError as e: + print(f"Warning: Encoding issue detected. Some characters may be lost. Error: {e}") + # Fall back to ASCII if UTF-8 fails + return text.encode('ascii', errors='ignore').decode('ascii') + except Exception as e: + raise ValueError(f"Error sanitizing input: {str(e)}") from e def escape_json_string(s): """ @@ -1079,9 +1084,54 @@ def wrap_text(draw, text, font, max_width): return '\n'.join(lines) def format_html(html_string): - soup = BeautifulSoup(html_string, 'html.parser') + soup = BeautifulSoup(html_string, 'lxml.parser') return soup.prettify() +def fast_format_html(html_string): + """ + A fast HTML formatter that uses string operations instead of parsing. 
+ + Args: + html_string (str): The HTML string to format + + Returns: + str: The formatted HTML string + """ + # Initialize variables + indent = 0 + indent_str = " " # Two spaces for indentation + formatted = [] + in_content = False + + # Split by < and > to separate tags and content + parts = html_string.replace('>', '>\n').replace('<', '\n<').split('\n') + + for part in parts: + if not part.strip(): + continue + + # Handle closing tags + if part.startswith(''): + formatted.append(indent_str * indent + part) + + # Handle opening tags + elif part.startswith('<'): + formatted.append(indent_str * indent + part) + indent += 1 + + # Handle content between tags + else: + content = part.strip() + if content: + formatted.append(indent_str * indent + content) + + return '\n'.join(formatted) + def normalize_url(href, base_url): """Normalize URLs to ensure consistent format""" from urllib.parse import urljoin, urlparse diff --git a/docs/md_v2/advanced/hooks-auth.md b/docs/md_v2/advanced/hooks-auth.md index e4b7d7ce..8da3a1cc 100644 --- a/docs/md_v2/advanced/hooks-auth.md +++ b/docs/md_v2/advanced/hooks-auth.md @@ -18,7 +18,7 @@ Let's see how we can customize the AsyncWebCrawler using hooks! 
In this example, import asyncio from crawl4ai import AsyncWebCrawler from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy -from playwright.async_api import Page, Browser +from playwright.async_api import Page, Browser, BrowserContext async def on_browser_created(browser: Browser): print("[HOOK] on_browser_created") @@ -71,7 +71,11 @@ from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy async def main(): print("\n🔗 Using Crawler Hooks: Let's see how we can customize the AsyncWebCrawler using hooks!") - crawler_strategy = AsyncPlaywrightCrawlerStrategy(verbose=True) + initial_cookies = [ + {"name": "sessionId", "value": "abc123", "domain": ".example.com"}, + {"name": "userId", "value": "12345", "domain": ".example.com"} + ] + crawler_strategy = AsyncPlaywrightCrawlerStrategy(verbose=True, cookies=initial_cookies) crawler_strategy.set_hook('on_browser_created', on_browser_created) crawler_strategy.set_hook('before_goto', before_goto) crawler_strategy.set_hook('after_goto', after_goto) diff --git a/tests/async/test_markdown_genertor.py b/tests/async/test_markdown_genertor.py index 025a0318..2b1102ab 100644 --- a/tests/async/test_markdown_genertor.py +++ b/tests/async/test_markdown_genertor.py @@ -11,7 +11,7 @@ import asyncio import os import time from typing import Dict, Any -from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerationStrategy +from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator # Get current directory __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) @@ -41,7 +41,7 @@ def test_basic_markdown_conversion(): with open(__location__ + "/data/wikipedia.html", "r") as f: cleaned_html = f.read() - generator = DefaultMarkdownGenerationStrategy() + generator = DefaultMarkdownGenerator() start_time = time.perf_counter() result = generator.generate_markdown( @@ -70,7 +70,7 @@ def test_relative_links(): Also an [image](/images/test.png) and another 
[page](/wiki/Banana). """ - generator = DefaultMarkdownGenerationStrategy() + generator = DefaultMarkdownGenerator() result = generator.generate_markdown( cleaned_html=markdown, base_url="https://en.wikipedia.org" @@ -86,7 +86,7 @@ def test_duplicate_links(): Here's a [link](/test) and another [link](/test) and a [different link](/other). """ - generator = DefaultMarkdownGenerationStrategy() + generator = DefaultMarkdownGenerator() result = generator.generate_markdown( cleaned_html=markdown, base_url="https://example.com" @@ -102,7 +102,7 @@ def test_link_descriptions(): Here's a [link with title](/test "Test Title") and a [link with description](/other) to test. """ - generator = DefaultMarkdownGenerationStrategy() + generator = DefaultMarkdownGenerator() result = generator.generate_markdown( cleaned_html=markdown, base_url="https://example.com" @@ -120,7 +120,7 @@ def test_performance_large_document(): iterations = 5 times = [] - generator = DefaultMarkdownGenerationStrategy() + generator = DefaultMarkdownGenerator() for i in range(iterations): start_time = time.perf_counter() @@ -144,7 +144,7 @@ def test_image_links(): And a regular [link](/page). 
""" - generator = DefaultMarkdownGenerationStrategy() + generator = DefaultMarkdownGenerator() result = generator.generate_markdown( cleaned_html=markdown, base_url="https://example.com" From 3ff0b0b2c472f6adfd864f580a5a73de65505e5b Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 28 Nov 2024 12:48:07 +0800 Subject: [PATCH 075/115] feat: update changelog for version 0.3.743 with new features, improvements, and contributor acknowledgments --- CHANGELOG.md | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8e5cc91a..5ec79639 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,53 @@ # Changelog +## [0.3.743] November 27, 2024 + +Enhance features and documentation +- Updated version to 0.3.743 +- Improved ManagedBrowser configuration with dynamic host/port +- Implemented fast HTML formatting in web crawler +- Enhanced markdown generation with a new generator class +- Improved sanitization and utility functions +- Added contributor details and pull request acknowledgments +- Updated documentation for clearer usage scenarios +- Adjusted tests to reflect class name changes + +### CONTRIBUTORS.md +Added new contributors and pull request details. +Updated community contributions and acknowledged pull requests. + +### crawl4ai/__version__.py +Version update. +Bumped version to 0.3.743. + +### crawl4ai/async_crawler_strategy.py +Improved ManagedBrowser configuration. +Enhanced browser initialization with configurable host and debugging port; improved hook execution. + +### crawl4ai/async_webcrawler.py +Optimized HTML processing. +Implemented 'fast_format_html' for optimized HTML formatting; applied it when 'prettiify' is enabled. + +### crawl4ai/content_scraping_strategy.py +Enhanced markdown generation strategy. +Updated to use DefaultMarkdownGenerator and improved markdown generation with filters option. 
+ +### crawl4ai/markdown_generation_strategy.py +Refactored markdown generation class. +Renamed DefaultMarkdownGenerationStrategy to DefaultMarkdownGenerator; added content filter handling. + +### crawl4ai/utils.py +Enhanced utility functions. +Improved input sanitization and enhanced HTML formatting method. + +### docs/md_v2/advanced/hooks-auth.md +Improved documentation for hooks. +Updated code examples to include cookies in crawler strategy initialization. + +### tests/async/test_markdown_genertor.py +Refactored tests to match class renaming. +Updated tests to use renamed DefaultMarkdownGenerator class. + ## [0.3.74] November 17, 2024 This changelog details the updates and changes introduced in Crawl4AI version 0.3.74. It's designed to inform developers about new features, modifications to existing components, removals, and other important information. From c2d47848102138e226ab06a4e2c40c80aef2a2cd Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 28 Nov 2024 12:56:31 +0800 Subject: [PATCH 076/115] fix: resolve merge conflict in DefaultMarkdownGenerator affecting fit_markdown generation --- crawl4ai/markdown_generation_strategy.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/crawl4ai/markdown_generation_strategy.py b/crawl4ai/markdown_generation_strategy.py index 377f6c84..f242054d 100644 --- a/crawl4ai/markdown_generation_strategy.py +++ b/crawl4ai/markdown_generation_strategy.py @@ -96,7 +96,6 @@ class DefaultMarkdownGenerator(MarkdownGenerationStrategy): ) # Generate fit markdown if content filter is provided -<<<<<<< HEAD fit_markdown: Optional[str] = "" filtered_html: Optional[str] = "" if content_filter or self.content_filter: @@ -104,8 +103,6 @@ class DefaultMarkdownGenerator(MarkdownGenerationStrategy): filtered_html = content_filter.filter_content(cleaned_html) filtered_html = '\n'.join('
    {}
    '.format(s) for s in filtered_html) fit_markdown = h.handle(filtered_html) ->>>>>>> origin/main - fit_markdown = h.handle(filtered_html) return MarkdownGenerationResult( raw_markdown=raw_markdown, From e4acd18429cf93ae7cd454c6b433fad703dee21c Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 28 Nov 2024 13:06:30 +0800 Subject: [PATCH 077/115] docs: update README for version 0.3.743 with new features, enhancements, and contributor acknowledgments --- README.md | 125 +++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 87 insertions(+), 38 deletions(-) diff --git a/README.md b/README.md index 5ba33dea..16d154b5 100644 --- a/README.md +++ b/README.md @@ -11,20 +11,15 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it accessible for large language models (LLMs) and AI applications. 🆓🌐 -## New in 0.3.74 ✨ +## New in 0.3.743 ✨ -- 🚀 **Blazing Fast Scraping**: Significantly improved scraping speed. -- 📥 **Download Manager**: Integrated file crawling, downloading, and tracking within `CrawlResult`. -- 📝 **Markdown Strategy**: Flexible system for custom markdown generation and formats. -- 🔗 **LLM-Friendly Citations**: Auto-converts links to numbered citations with reference lists. -- 🔎 **Markdown Filter**: BM25-based content extraction for cleaner, relevant markdown. -- 🖼️ **Image Extraction**: Supports `srcset`, `picture`, and responsive image formats. -- 🗂️ **Local/Raw HTML**: Crawl `file://` paths and raw HTML (`raw:`) directly. -- 🤖 **Browser Control**: Custom browser setups with stealth integration to bypass bots. -- ☁️ **API & Cache Boost**: CORS, static serving, and enhanced filesystem-based caching. -- 🐳 **API Gateway**: Run as an API service with secure token authentication. -- 🛠️ **Database Upgrades**: Optimized for larger content sets with faster caching. -- 🐛 **Bug Fixes**: Resolved browser context issues, memory leaks, and improved error handling. 
+🚀 **Improved ManagedBrowser Configuration**: Dynamic host and port support for more flexible browser management. +📝 **Enhanced Markdown Generation**: New generator class for better formatting and customization. +⚡ **Fast HTML Formatting**: Significantly optimized HTML formatting in the web crawler. +🛠️ **Utility & Sanitization Upgrades**: Improved sanitization and expanded utility functions for streamlined workflows. +👥 **Acknowledgments**: Added contributor details and pull request acknowledgments for better transparency. +📖 **Documentation Updates**: Clearer usage scenarios and updated guidance for better user onboarding. +🧪 **Test Adjustments**: Refined tests to align with recent class name changes. ## Try it Now! @@ -35,31 +30,85 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it acc ## Features ✨ -- 🆓 Completely free and open-source -- 🚀 Blazing fast performance, outperforming many paid services -- 🤖 LLM-friendly output formats (JSON, cleaned HTML, markdown) -- 🌐 Multi-browser support (Chromium, Firefox, WebKit) -- 🌍 Supports crawling multiple URLs simultaneously -- 🎨 Extracts and returns all media tags (Images, Audio, and Video) -- 🔗 Extracts all external and internal links -- 📚 Extracts metadata from the page -- 🔄 Custom hooks for authentication, headers, and page modifications -- 🕵️ User-agent customization -- 🖼️ Takes screenshots of pages with enhanced error handling -- 📜 Executes multiple custom JavaScripts before crawling -- 📊 Generates structured output without LLM using JsonCssExtractionStrategy -- 📚 Various chunking strategies: topic-based, regex, sentence, and more -- 🧠 Advanced extraction strategies: cosine clustering, LLM, and more -- 🎯 CSS selector support for precise data extraction -- 📝 Passes instructions/keywords to refine extraction -- 🔒 Proxy support with authentication for enhanced access -- 🔄 Session management for complex multi-page crawling -- 🌐 Asynchronous architecture for improved performance -- 🖼️ 
Improved image processing with lazy-loading detection -- 🕰️ Enhanced handling of delayed content loading -- 🔑 Custom headers support for LLM interactions -- 🖼️ iframe content extraction for comprehensive analysis -- ⏱️ Flexible timeout and delayed content retrieval options +
    +🚀 Performance & Scalability + +- ⚡ **Blazing Fast Scraping**: Outperforms many paid services with cutting-edge optimization. +- 🔄 **Asynchronous Architecture**: Enhanced performance for complex multi-page crawling. +- ⚡ **Dynamic HTML Formatting**: New, fast HTML formatting for streamlined workflows. +- 🗂️ **Large Dataset Optimization**: Improved caching for handling massive content sets. + +
    + +
    +🔎 Extraction Capabilities + +- 🖼️ **Comprehensive Media Support**: Extracts images, audio, video, and responsive image formats like `srcset` and `picture`. +- 📚 **Advanced Content Chunking**: Topic-based, regex, sentence-level, and cosine clustering strategies. +- 🎯 **Precise Data Extraction**: Supports CSS selectors and keyword-based refinements. +- 🔗 **All-Inclusive Link Crawling**: Extracts internal and external links. +- 📝 **Markdown Generation**: Enhanced markdown generator class for custom, clean, LLM-friendly outputs. +- 🏷️ **Metadata Extraction**: Fetches metadata directly from pages. + +
    + +
    +🌐 Browser Integration + +- 🌍 **Multi-Browser Support**: Works with Chromium, Firefox, and WebKit. +- 🖥️ **ManagedBrowser with Dynamic Config**: Flexible host/port control for tailored setups. +- ⚙️ **Custom Browser Hooks**: Authentication, headers, and page modifications. +- 🕶️ **Stealth Mode**: Bypasses bot detection with advanced techniques. +- 📸 **Screenshots & JavaScript Execution**: Takes screenshots and executes custom JavaScript before crawling. + +
    + +
    +📁 Input/Output Flexibility + +- 📂 **Local & Raw HTML Crawling**: Directly processes `file://` paths and raw HTML. +- 🌐 **Custom Headers for LLM**: Tailored headers for enhanced AI interactions. +- 🛠️ **Structured Output Options**: Supports JSON, cleaned HTML, and markdown outputs. + +
    + +
    +🔧 Utility & Debugging + +- 🛡️ **Error Handling**: Robust error management for seamless execution. +- 🔐 **Session Management**: Handles complex, multi-page interactions. +- 🧹 **Utility Functions**: Enhanced sanitization and flexible extraction helpers. +- 🕰️ **Delayed Content Loading**: Improved handling of lazy-loading and dynamic content. + +
    + +
    +🔐 Security & Accessibility + +- 🕵️ **Proxy Support**: Enables authenticated access for restricted pages. +- 🚪 **API Gateway**: Deploy as an API service with secure token authentication. +- 🌐 **CORS & Static Serving**: Enhanced support for filesystem-based caching and cross-origin requests. + +
    + +
    +🌟 Community & Documentation + +- 🙌 **Contributor Acknowledgments**: Recognition for pull requests and contributions. +- 📖 **Clear Documentation**: Simplified and updated for better onboarding and usage. + +
    + +
    +🎯 Cutting-Edge Features + +- 🛠️ **BM25-Based Markdown Filtering**: Extracts cleaner, context-relevant markdown. +- 📚 **LLM-Friendly Citations**: Auto-converts links to numbered citations with reference lists. +- 📡 **IFrame Content Extraction**: Comprehensive analysis for embedded content. +- 🕰️ **Flexible Content Retrieval**: Combines timing-based strategies for reliable extractions. + +
    + ## Installation 🛠️ From ce7d49484fc097a834d1eac883ecce6f444ceb1e Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 28 Nov 2024 13:06:46 +0800 Subject: [PATCH 078/115] docs: update README for version 0.3.743 with new features, enhancements, and contributor acknowledgments --- README.md | 125 +++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 87 insertions(+), 38 deletions(-) diff --git a/README.md b/README.md index 5ba33dea..16d154b5 100644 --- a/README.md +++ b/README.md @@ -11,20 +11,15 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it accessible for large language models (LLMs) and AI applications. 🆓🌐 -## New in 0.3.74 ✨ +## New in 0.3.743 ✨ -- 🚀 **Blazing Fast Scraping**: Significantly improved scraping speed. -- 📥 **Download Manager**: Integrated file crawling, downloading, and tracking within `CrawlResult`. -- 📝 **Markdown Strategy**: Flexible system for custom markdown generation and formats. -- 🔗 **LLM-Friendly Citations**: Auto-converts links to numbered citations with reference lists. -- 🔎 **Markdown Filter**: BM25-based content extraction for cleaner, relevant markdown. -- 🖼️ **Image Extraction**: Supports `srcset`, `picture`, and responsive image formats. -- 🗂️ **Local/Raw HTML**: Crawl `file://` paths and raw HTML (`raw:`) directly. -- 🤖 **Browser Control**: Custom browser setups with stealth integration to bypass bots. -- ☁️ **API & Cache Boost**: CORS, static serving, and enhanced filesystem-based caching. -- 🐳 **API Gateway**: Run as an API service with secure token authentication. -- 🛠️ **Database Upgrades**: Optimized for larger content sets with faster caching. -- 🐛 **Bug Fixes**: Resolved browser context issues, memory leaks, and improved error handling. +🚀 **Improved ManagedBrowser Configuration**: Dynamic host and port support for more flexible browser management. +📝 **Enhanced Markdown Generation**: New generator class for better formatting and customization. 
+⚡ **Fast HTML Formatting**: Significantly optimized HTML formatting in the web crawler. +🛠️ **Utility & Sanitization Upgrades**: Improved sanitization and expanded utility functions for streamlined workflows. +👥 **Acknowledgments**: Added contributor details and pull request acknowledgments for better transparency. +📖 **Documentation Updates**: Clearer usage scenarios and updated guidance for better user onboarding. +🧪 **Test Adjustments**: Refined tests to align with recent class name changes. ## Try it Now! @@ -35,31 +30,85 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it acc ## Features ✨ -- 🆓 Completely free and open-source -- 🚀 Blazing fast performance, outperforming many paid services -- 🤖 LLM-friendly output formats (JSON, cleaned HTML, markdown) -- 🌐 Multi-browser support (Chromium, Firefox, WebKit) -- 🌍 Supports crawling multiple URLs simultaneously -- 🎨 Extracts and returns all media tags (Images, Audio, and Video) -- 🔗 Extracts all external and internal links -- 📚 Extracts metadata from the page -- 🔄 Custom hooks for authentication, headers, and page modifications -- 🕵️ User-agent customization -- 🖼️ Takes screenshots of pages with enhanced error handling -- 📜 Executes multiple custom JavaScripts before crawling -- 📊 Generates structured output without LLM using JsonCssExtractionStrategy -- 📚 Various chunking strategies: topic-based, regex, sentence, and more -- 🧠 Advanced extraction strategies: cosine clustering, LLM, and more -- 🎯 CSS selector support for precise data extraction -- 📝 Passes instructions/keywords to refine extraction -- 🔒 Proxy support with authentication for enhanced access -- 🔄 Session management for complex multi-page crawling -- 🌐 Asynchronous architecture for improved performance -- 🖼️ Improved image processing with lazy-loading detection -- 🕰️ Enhanced handling of delayed content loading -- 🔑 Custom headers support for LLM interactions -- 🖼️ iframe content extraction for comprehensive analysis -- ⏱️ 
Flexible timeout and delayed content retrieval options +
    +🚀 Performance & Scalability + +- ⚡ **Blazing Fast Scraping**: Outperforms many paid services with cutting-edge optimization. +- 🔄 **Asynchronous Architecture**: Enhanced performance for complex multi-page crawling. +- ⚡ **Dynamic HTML Formatting**: New, fast HTML formatting for streamlined workflows. +- 🗂️ **Large Dataset Optimization**: Improved caching for handling massive content sets. + +
    + +
    +🔎 Extraction Capabilities + +- 🖼️ **Comprehensive Media Support**: Extracts images, audio, video, and responsive image formats like `srcset` and `picture`. +- 📚 **Advanced Content Chunking**: Topic-based, regex, sentence-level, and cosine clustering strategies. +- 🎯 **Precise Data Extraction**: Supports CSS selectors and keyword-based refinements. +- 🔗 **All-Inclusive Link Crawling**: Extracts internal and external links. +- 📝 **Markdown Generation**: Enhanced markdown generator class for custom, clean, LLM-friendly outputs. +- 🏷️ **Metadata Extraction**: Fetches metadata directly from pages. + +
    + +
    +🌐 Browser Integration + +- 🌍 **Multi-Browser Support**: Works with Chromium, Firefox, and WebKit. +- 🖥️ **ManagedBrowser with Dynamic Config**: Flexible host/port control for tailored setups. +- ⚙️ **Custom Browser Hooks**: Authentication, headers, and page modifications. +- 🕶️ **Stealth Mode**: Bypasses bot detection with advanced techniques. +- 📸 **Screenshots & JavaScript Execution**: Takes screenshots and executes custom JavaScript before crawling. + +
    + +
    +📁 Input/Output Flexibility + +- 📂 **Local & Raw HTML Crawling**: Directly processes `file://` paths and raw HTML. +- 🌐 **Custom Headers for LLM**: Tailored headers for enhanced AI interactions. +- 🛠️ **Structured Output Options**: Supports JSON, cleaned HTML, and markdown outputs. + +
    + +
    +🔧 Utility & Debugging + +- 🛡️ **Error Handling**: Robust error management for seamless execution. +- 🔐 **Session Management**: Handles complex, multi-page interactions. +- 🧹 **Utility Functions**: Enhanced sanitization and flexible extraction helpers. +- 🕰️ **Delayed Content Loading**: Improved handling of lazy-loading and dynamic content. + +
    + +
    +🔐 Security & Accessibility + +- 🕵️ **Proxy Support**: Enables authenticated access for restricted pages. +- 🚪 **API Gateway**: Deploy as an API service with secure token authentication. +- 🌐 **CORS & Static Serving**: Enhanced support for filesystem-based caching and cross-origin requests. + +
    + +
    +🌟 Community & Documentation + +- 🙌 **Contributor Acknowledgments**: Recognition for pull requests and contributions. +- 📖 **Clear Documentation**: Simplified and updated for better onboarding and usage. + +
    + +
    +🎯 Cutting-Edge Features + +- 🛠️ **BM25-Based Markdown Filtering**: Extracts cleaner, context-relevant markdown. +- 📚 **LLM-Friendly Citations**: Auto-converts links to numbered citations with reference lists. +- 📡 **IFrame Content Extraction**: Comprehensive analysis for embedded content. +- 🕰️ **Flexible Content Retrieval**: Combines timing-based strategies for reliable extractions. + +
    + ## Installation 🛠️ From d556dada9fb4003b42cf7d619ff44feef478cf2c Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 28 Nov 2024 13:07:33 +0800 Subject: [PATCH 079/115] docs: update README to keep details open for extraction capabilities, browser integration, input/output flexibility, utility & debugging, security & accessibility, community & documentation, and cutting-edge features --- README.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 16d154b5..cd643211 100644 --- a/README.md +++ b/README.md @@ -40,7 +40,7 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it acc -
    +
    🔎 Extraction Capabilities - 🖼️ **Comprehensive Media Support**: Extracts images, audio, video, and responsive image formats like `srcset` and `picture`. @@ -52,7 +52,7 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it acc
    -
    +
    🌐 Browser Integration - 🌍 **Multi-Browser Support**: Works with Chromium, Firefox, and WebKit. @@ -63,7 +63,7 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it acc
    -
    +
    📁 Input/Output Flexibility - 📂 **Local & Raw HTML Crawling**: Directly processes `file://` paths and raw HTML. @@ -72,7 +72,7 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it acc
    -
    +
    🔧 Utility & Debugging - 🛡️ **Error Handling**: Robust error management for seamless execution. @@ -82,7 +82,7 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it acc
    -
    +
    🔐 Security & Accessibility - 🕵️ **Proxy Support**: Enables authenticated access for restricted pages. @@ -91,7 +91,7 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it acc
    -
    +
    🌟 Community & Documentation - 🙌 **Contributor Acknowledgments**: Recognition for pull requests and contributions. @@ -99,7 +99,7 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it acc
    -
    +
    🎯 Cutting-Edge Features - 🛠️ **BM25-Based Markdown Filtering**: Extracts cleaner, context-relevant markdown. From 3abb573142d5588a1fc5790e2731ca8641ca4a95 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 28 Nov 2024 13:07:59 +0800 Subject: [PATCH 080/115] docs: update README for version 0.3.743 with improved formatting and contributor acknowledgments --- README.md | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index cd643211..e02d7ef8 100644 --- a/README.md +++ b/README.md @@ -13,13 +13,11 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it acc ## New in 0.3.743 ✨ -🚀 **Improved ManagedBrowser Configuration**: Dynamic host and port support for more flexible browser management. -📝 **Enhanced Markdown Generation**: New generator class for better formatting and customization. -⚡ **Fast HTML Formatting**: Significantly optimized HTML formatting in the web crawler. -🛠️ **Utility & Sanitization Upgrades**: Improved sanitization and expanded utility functions for streamlined workflows. -👥 **Acknowledgments**: Added contributor details and pull request acknowledgments for better transparency. -📖 **Documentation Updates**: Clearer usage scenarios and updated guidance for better user onboarding. -🧪 **Test Adjustments**: Refined tests to align with recent class name changes. +- 🚀 **Improved ManagedBrowser Configuration**: Dynamic host and port support for more flexible browser management. +- 📝 **Enhanced Markdown Generation**: New generator class for better formatting and customization. +- ⚡ **Fast HTML Formatting**: Significantly optimized HTML formatting in the web crawler. +- 🛠️ **Utility & Sanitization Upgrades**: Improved sanitization and expanded utility functions for streamlined workflows. +- 👥 **Acknowledgments**: Added contributor details and pull request acknowledgments for better transparency. ## Try it Now! 
From d583aa43ca1404788838820ebfb90d2e8ee8680d Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 28 Nov 2024 15:53:25 +0800 Subject: [PATCH 081/115] refactor: update cache handling in quickstart_async example to use CacheMode enum --- README.md | 470 +++++++++++++++--------------- docs/examples/quickstart_async.py | 95 +++--- 2 files changed, 296 insertions(+), 269 deletions(-) diff --git a/README.md b/README.md index e02d7ef8..5c50cdc5 100644 --- a/README.md +++ b/README.md @@ -29,94 +29,86 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it acc ## Features ✨
    -🚀 Performance & Scalability - -- ⚡ **Blazing Fast Scraping**: Outperforms many paid services with cutting-edge optimization. -- 🔄 **Asynchronous Architecture**: Enhanced performance for complex multi-page crawling. -- ⚡ **Dynamic HTML Formatting**: New, fast HTML formatting for streamlined workflows. -- 🗂️ **Large Dataset Optimization**: Improved caching for handling massive content sets. +📝 Markdown Generation +- 🧹 **Clean Markdown**: Generates clean, structured Markdown with accurate formatting. +- 🎯 **Fit Markdown**: Heuristic-based filtering to remove noise and irrelevant parts for AI-friendly processing. +- 🔗 **Citations and References**: Converts page links into a numbered reference list with clean citations. +- 🛠️ **Custom Strategies**: Users can create their own Markdown generation strategies tailored to specific needs. +- 📚 **BM25 Algorithm**: Employs BM25-based filtering for extracting core information and removing irrelevant content.
    -🔎 Extraction Capabilities +📊 Structured Data Extraction -- 🖼️ **Comprehensive Media Support**: Extracts images, audio, video, and responsive image formats like `srcset` and `picture`. -- 📚 **Advanced Content Chunking**: Topic-based, regex, sentence-level, and cosine clustering strategies. -- 🎯 **Precise Data Extraction**: Supports CSS selectors and keyword-based refinements. -- 🔗 **All-Inclusive Link Crawling**: Extracts internal and external links. -- 📝 **Markdown Generation**: Enhanced markdown generator class for custom, clean, LLM-friendly outputs. -- 🏷️ **Metadata Extraction**: Fetches metadata directly from pages. +- 🤖 **LLM-Driven Extraction**: Supports all LLMs (open-source and proprietary) for structured data extraction. +- 🧱 **Chunking Strategies**: Implements chunking (topic-based, regex, sentence-level) for targeted content processing. +- 🌌 **Cosine Similarity**: Find relevant content chunks based on user queries for semantic extraction. +- 🔎 **CSS-Based Extraction**: Fast schema-based data extraction using XPath and CSS selectors. +- 🔧 **Schema Definition**: Define custom schemas for extracting structured JSON from repetitive patterns.
    🌐 Browser Integration -- 🌍 **Multi-Browser Support**: Works with Chromium, Firefox, and WebKit. -- 🖥️ **ManagedBrowser with Dynamic Config**: Flexible host/port control for tailored setups. -- ⚙️ **Custom Browser Hooks**: Authentication, headers, and page modifications. -- 🕶️ **Stealth Mode**: Bypasses bot detection with advanced techniques. -- 📸 **Screenshots & JavaScript Execution**: Takes screenshots and executes custom JavaScript before crawling. +- 🖥️ **Managed Browser**: Use user-owned browsers with full control, avoiding bot detection. +- 🔄 **Remote Browser Control**: Connect via the Chrome DevTools Protocol (CDP) for remote, large-scale data extraction. +- 🔒 **Session Management**: Preserve browser states and reuse them for multi-step crawling. +- 🧩 **Proxy Support**: Seamlessly connect to proxies with authentication for secure access. +- ⚙️ **Full Browser Control**: Modify headers, cookies, user agents, and more for tailored crawling setups. +- 🌍 **Multi-Browser Support**: Compatible with Chromium, Firefox, and WebKit. 
    -📁 Input/Output Flexibility +🔎 Crawling & Scraping -- 📂 **Local & Raw HTML Crawling**: Directly processes `file://` paths and raw HTML. -- 🌐 **Custom Headers for LLM**: Tailored headers for enhanced AI interactions. -- 🛠️ **Structured Output Options**: Supports JSON, cleaned HTML, and markdown outputs. +- 🖼️ **Media Support**: Extract images, audio, videos, and responsive image formats like `srcset` and `picture`. +- 🚀 **Dynamic Crawling**: Execute JavaScript and wait for asynchronous or synchronous conditions before extracting dynamic content. +- 📸 **Screenshots**: Capture page screenshots during crawling for debugging or analysis. +- 📂 **Raw Data Crawling**: Directly process raw HTML (`raw:`) or local files (`file://`). +- 🔗 **Comprehensive Link Extraction**: Extracts internal and external links, as well as embedded iframe content. +- 🛠️ **Customizable Hooks**: Define hooks at every step to customize crawling behavior. +- 💾 **Caching**: Cache data for improved speed and to avoid redundant fetches. +- 📄 **Metadata Extraction**: Retrieve structured metadata from web pages. +- 📡 **IFrame Content Extraction**: Seamless extraction from embedded iframe content. 
    -🔧 Utility & Debugging +🚀 Deployment +- 🐳 **Dockerized Setup**: Optimized Docker image with API server for easy deployment. +- 🔄 **API Gateway**: One-click deployment with secure token authentication for API-based workflows. +- 🌐 **Scalable Architecture**: Designed for mass-scale production and optimized server performance. +- ⚙️ **DigitalOcean Deployment**: Ready-to-deploy configurations for DigitalOcean and similar platforms. + +
    + +
    +🎯 Additional Features + +- 🕶️ **Stealth Mode**: Avoid bot detection by mimicking real users. +- 🏷️ **Tag-Based Content Extraction**: Refine crawling based on custom tags, headers, or metadata. +- 🔗 **Link Analysis**: Extract and analyze all links for detailed data exploration. - 🛡️ **Error Handling**: Robust error management for seamless execution. -- 🔐 **Session Management**: Handles complex, multi-page interactions. -- 🧹 **Utility Functions**: Enhanced sanitization and flexible extraction helpers. -- 🕰️ **Delayed Content Loading**: Improved handling of lazy-loading and dynamic content. +- 🔐 **CORS & Static Serving**: Supports filesystem-based caching and cross-origin requests. +- 📖 **Clear Documentation**: Simplified and updated guides for onboarding and advanced usage. +- 🙌 **Community Recognition**: Acknowledges contributors and pull requests for transparency.
    -
    -🔐 Security & Accessibility - -- 🕵️ **Proxy Support**: Enables authenticated access for restricted pages. -- 🚪 **API Gateway**: Deploy as an API service with secure token authentication. -- 🌐 **CORS & Static Serving**: Enhanced support for filesystem-based caching and cross-origin requests. - -
    - -
    -🌟 Community & Documentation - -- 🙌 **Contributor Acknowledgments**: Recognition for pull requests and contributions. -- 📖 **Clear Documentation**: Simplified and updated for better onboarding and usage. - -
    - -
    -🎯 Cutting-Edge Features - -- 🛠️ **BM25-Based Markdown Filtering**: Extracts cleaner, context-relevant markdown. -- 📚 **LLM-Friendly Citations**: Auto-converts links to numbered citations with reference lists. -- 📡 **IFrame Content Extraction**: Comprehensive analysis for embedded content. -- 🕰️ **Flexible Content Retrieval**: Combines timing-based strategies for reliable extractions. - -
    - - ## Installation 🛠️ Crawl4AI offers flexible installation options to suit various use cases. You can install it as a Python package or use Docker. -### Using pip 🐍 +
    +🐍 Using pip Choose the installation option that best fits your needs: -#### Basic Installation +### Basic Installation For basic web crawling and scraping tasks: @@ -126,7 +118,7 @@ pip install crawl4ai By default, this will install the asynchronous version of Crawl4AI, using Playwright for web crawling. -👉 Note: When you install Crawl4AI, the setup script should automatically install and set up Playwright. However, if you encounter any Playwright-related errors, you can manually install it using one of these methods: +👉 **Note**: When you install Crawl4AI, the setup script should automatically install and set up Playwright. However, if you encounter any Playwright-related errors, you can manually install it using one of these methods: 1. Through the command line: @@ -142,15 +134,19 @@ By default, this will install the asynchronous version of Crawl4AI, using Playwr This second method has proven to be more reliable in some cases. -#### Installation with Synchronous Version +--- -If you need the synchronous version using Selenium: +### Installation with Synchronous Version + +The sync version is deprecated and will be removed in future versions. If you need the synchronous version using Selenium: ```bash pip install crawl4ai[sync] ``` -#### Development Installation +--- + +### Development Installation For contributors who plan to modify the source code: @@ -159,7 +155,9 @@ git clone https://github.com/unclecode/crawl4ai.git cd crawl4ai pip install -e . # Basic installation in editable mode ``` + Install optional features: + ```bash pip install -e ".[torch]" # With PyTorch features pip install -e ".[transformer]" # With Transformer features @@ -168,7 +166,10 @@ pip install -e ".[sync]" # With synchronous crawling (Selenium) pip install -e ".[all]" # Install all optional features ``` -## One-Click Deployment 🚀 +
    + +
    +🚀 One-Click Deployment Deploy your own instance of Crawl4AI with one click: @@ -179,14 +180,19 @@ Deploy your own instance of Crawl4AI with one click: The deploy will: - Set up a Docker container with Crawl4AI - Configure Playwright and all dependencies -- Start the FastAPI server on port 11235 +- Start the FastAPI server on port `11235` - Set up health checks and auto-deployment -### Using Docker 🐳 +
    + +
    +🐳 Using Docker Crawl4AI is available as Docker images for easy deployment. You can either pull directly from Docker Hub (recommended) or build from the repository. -#### Option 1: Docker Hub (Recommended) +--- + +### Option 1: Docker Hub (Recommended) ```bash # Pull and run from Docker Hub (choose one): @@ -204,7 +210,9 @@ docker run --platform linux/arm64 -p 11235:11235 unclecode/crawl4ai:basic docker run --shm-size=2gb -p 11235:11235 unclecode/crawl4ai:basic ``` -#### Option 2: Build from Repository +--- + +### Option 2: Build from Repository ```bash # Clone the repository @@ -226,7 +234,12 @@ docker build -t crawl4ai:local \ docker run -p 11235:11235 crawl4ai:local ``` -Quick test (works for both options): +--- + +### Quick Test + +Run a quick test (works for both Docker options): + ```python import requests @@ -243,143 +256,149 @@ result = requests.get(f"http://localhost:11235/task/{task_id}") For advanced configuration, environment variables, and usage examples, see our [Docker Deployment Guide](https://crawl4ai.com/mkdocs/basic/docker-deployment/). +
    + ## Quick Start 🚀 ```python import asyncio -from crawl4ai import AsyncWebCrawler +from crawl4ai import AsyncWebCrawler, CacheMode async def main(): async with AsyncWebCrawler(verbose=True) as crawler: result = await crawler.arun(url="https://www.nbcnews.com/business") - print(result.markdown) + print(result.markdown_v2.raw_markdown) # Soon will be changed to result.markdown if __name__ == "__main__": asyncio.run(main()) ``` -## Advanced Usage 🔬 +## Advanced Usage Examples 🔬 -### Executing JavaScript and Using CSS Selectors +You can check the project structure in the directory [docs/examples](docs/examples). Over there, you can find a variety of examples; here, some popular examples are shared. + +
    +🖥️ Heuristic Markdown Generation with Clean and Fit Markdown ```python import asyncio -from crawl4ai import AsyncWebCrawler +from crawl4ai import AsyncWebCrawler, CacheMode +from crawl4ai.content_filter_strategy import BM25ContentFilter +from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator async def main(): - async with AsyncWebCrawler(verbose=True) as crawler: - js_code = ["const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"] + async with AsyncWebCrawler( + headless=True, + verbose=True, + ) as crawler: result = await crawler.arun( - url="https://www.nbcnews.com/business", - js_code=js_code, - css_selector=".wide-tease-item__description", - bypass_cache=True + url="https://docs.micronaut.io/4.7.6/guide/", + cache_mode=CacheMode.ENABLED, + markdown_generator=DefaultMarkdownGenerator( + content_filter=BM25ContentFilter(user_query=None, bm25_threshold=1.0) + ), ) - print(result.extracted_content) + print(len(result.markdown)) + print(len(result.fit_markdown)) + print(len(result.markdown_v2.fit_markdown)) if __name__ == "__main__": asyncio.run(main()) ``` -### Using a Proxy +
    + +
    +🖥️ Structured Data Extraction and Executing JavaScript ```python import asyncio -from crawl4ai import AsyncWebCrawler - -async def main(): - async with AsyncWebCrawler(verbose=True, proxy="http://127.0.0.1:7890") as crawler: - result = await crawler.arun( - url="https://www.nbcnews.com/business", - bypass_cache=True - ) - print(result.markdown) - -if __name__ == "__main__": - asyncio.run(main()) -``` - -### Extracting Structured Data without LLM - -The `JsonCssExtractionStrategy` allows for precise extraction of structured data from web pages using CSS selectors. - -```python -import asyncio -import json -from crawl4ai import AsyncWebCrawler +from crawl4ai import AsyncWebCrawler, CacheMode from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +import json -async def extract_news_teasers(): +async def main(): schema = { - "name": "News Teaser Extractor", - "baseSelector": ".wide-tease-item__wrapper", - "fields": [ - { - "name": "category", - "selector": ".unibrow span[data-testid='unibrow-text']", - "type": "text", - }, - { - "name": "headline", - "selector": ".wide-tease-item__headline", - "type": "text", - }, - { - "name": "summary", - "selector": ".wide-tease-item__description", - "type": "text", - }, - { - "name": "time", - "selector": "[data-testid='wide-tease-date']", - "type": "text", - }, - { - "name": "image", - "type": "nested", - "selector": "picture.teasePicture img", - "fields": [ - {"name": "src", "type": "attribute", "attribute": "src"}, - {"name": "alt", "type": "attribute", "attribute": "alt"}, - ], - }, - { - "name": "link", - "selector": "a[href]", - "type": "attribute", - "attribute": "href", - }, - ], - } + "name": "KidoCode Courses", + "baseSelector": "section.charge-methodology .w-tab-content > div", + "fields": [ + { + "name": "section_title", + "selector": "h3.heading-50", + "type": "text", + }, + { + "name": "section_description", + "selector": ".charge-content", + "type": "text", + }, + { + "name": "course_name", + 
"selector": ".text-block-93", + "type": "text", + }, + { + "name": "course_description", + "selector": ".course-content-text", + "type": "text", + }, + { + "name": "course_icon", + "selector": ".image-92", + "type": "attribute", + "attribute": "src" + } + ] +} extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True) - async with AsyncWebCrawler(verbose=True) as crawler: + async with AsyncWebCrawler( + headless=False, + verbose=True + ) as crawler: + + # Create the JavaScript that handles clicking multiple times + js_click_tabs = """ + (async () => { + const tabs = document.querySelectorAll("section.charge-methodology .tabs-menu-3 > div"); + + for(let tab of tabs) { + // scroll to the tab + tab.scrollIntoView(); + tab.click(); + // Wait for content to load and animations to complete + await new Promise(r => setTimeout(r, 500)); + } + })(); + """ + result = await crawler.arun( - url="https://www.nbcnews.com/business", - extraction_strategy=extraction_strategy, - bypass_cache=True, + url="https://www.kidocode.com/degrees/technology", + extraction_strategy=JsonCssExtractionStrategy(schema, verbose=True), + js_code=[js_click_tabs], + cache_mode=CacheMode.BYPASS ) - assert result.success, "Failed to crawl the page" + companies = json.loads(result.extracted_content) + print(f"Successfully extracted {len(companies)} companies") + print(json.dumps(companies[0], indent=2)) - news_teasers = json.loads(result.extracted_content) - print(f"Successfully extracted {len(news_teasers)} news teasers") - print(json.dumps(news_teasers[0], indent=2)) if __name__ == "__main__": - asyncio.run(extract_news_teasers()) + asyncio.run(main()) ``` -For more advanced usage examples, check out our [Examples](https://crawl4ai.com/mkdocs/extraction/css-advanced/) section in the documentation. +
    -### Extracting Structured Data with OpenAI +
    +🤖 Extracting Structured Data with LLMs ```python import os import asyncio -from crawl4ai import AsyncWebCrawler +from crawl4ai import AsyncWebCrawler, CacheMode from crawl4ai.extraction_strategy import LLMExtractionStrategy from pydantic import BaseModel, Field @@ -394,6 +413,8 @@ async def main(): url='https://openai.com/api/pricing/', word_count_threshold=1, extraction_strategy=LLMExtractionStrategy( + # Here you can use any provider that the LiteLLM library supports, for instance: ollama/qwen2 + # provider="ollama/qwen2", api_token="no-token", provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY'), schema=OpenAIModelFee.schema(), extraction_type="schema", @@ -401,7 +422,7 @@ async def main(): Do not miss any models in the entire content. One extracted model JSON format should look like this: {"model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens"}.""" ), - bypass_cache=True, + cache_mode=CacheMode.BYPASS, ) print(result.extracted_content) @@ -409,105 +430,86 @@ if __name__ == "__main__": asyncio.run(main()) ``` -### Session Management and Dynamic Content Crawling +
    -Crawl4AI excels at handling complex scenarios, such as crawling multiple pages with dynamic content loaded via JavaScript. Here's an example of crawling GitHub commits across multiple pages: +
    +🤖 Using Your Own Browser with a Custom User Profile ```python -import asyncio -import re -from bs4 import BeautifulSoup +import os, sys +from pathlib import Path +import asyncio, time from crawl4ai import AsyncWebCrawler -async def crawl_typescript_commits(): - first_commit = "" - async def on_execution_started(page): - nonlocal first_commit - try: - while True: - await page.wait_for_selector('li.Box-sc-g0xbh4-0 h4') - commit = await page.query_selector('li.Box-sc-g0xbh4-0 h4') - commit = await commit.evaluate('(element) => element.textContent') - commit = re.sub(r'\s+', '', commit) - if commit and commit != first_commit: - first_commit = commit - break - await asyncio.sleep(0.5) - except Exception as e: - print(f"Warning: New content didn't appear after JavaScript execution: {e}") +async def test_news_crawl(): + # Create a persistent user data directory + user_data_dir = os.path.join(Path.home(), ".crawl4ai", "browser_profile") + os.makedirs(user_data_dir, exist_ok=True) - async with AsyncWebCrawler(verbose=True) as crawler: - crawler.crawler_strategy.set_hook('on_execution_started', on_execution_started) - - url = "https://github.com/microsoft/TypeScript/commits/main" - session_id = "typescript_commits_session" - all_commits = [] - - js_next_page = """ - const button = document.querySelector('a[data-testid="pagination-next-button"]'); - if (button) button.click(); - """ - - for page in range(3): # Crawl 3 pages - result = await crawler.arun( - url=url, - session_id=session_id, - css_selector="li.Box-sc-g0xbh4-0", - js=js_next_page if page > 0 else None, - bypass_cache=True, - js_only=page > 0 - ) - - assert result.success, f"Failed to crawl page {page + 1}" - - soup = BeautifulSoup(result.cleaned_html, 'html.parser') - commits = soup.select("li") - all_commits.extend(commits) - - print(f"Page {page + 1}: Found {len(commits)} commits") - - await crawler.crawler_strategy.kill_session(session_id) - print(f"Successfully crawled {len(all_commits)} commits across 3 
pages") - -if __name__ == "__main__": - asyncio.run(crawl_typescript_commits()) + async with AsyncWebCrawler( + verbose=True, + headless=True, + user_data_dir=user_data_dir, + use_persistent_context=True, + headers={ + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", + "Accept-Language": "en-US,en;q=0.5", + "Accept-Encoding": "gzip, deflate, br", + "DNT": "1", + "Connection": "keep-alive", + "Upgrade-Insecure-Requests": "1", + "Sec-Fetch-Dest": "document", + "Sec-Fetch-Mode": "navigate", + "Sec-Fetch-Site": "none", + "Sec-Fetch-User": "?1", + "Cache-Control": "max-age=0", + } + ) as crawler: + url = "ADDRESS_OF_A_CHALLENGING_WEBSITE" + + result = await crawler.arun( + url, + cache_mode=CacheMode.BYPASS, + magic=True, + ) + + print(f"Successfully crawled {url}") + print(f"Content length: {len(result.markdown)}") ``` -This example demonstrates Crawl4AI's ability to handle complex scenarios where content is loaded asynchronously. It crawls multiple pages of GitHub commits, executing JavaScript to load new content and using custom hooks to ensure data is loaded before proceeding. - -For more advanced usage examples, check out our [Examples](https://crawl4ai.com/mkdocs/tutorial/episode_12_Session-Based_Crawling_for_Dynamic_Websites/) section in the documentation.
    ## Speed Comparison 🚀 +A test was conducted on **[NBC News - Business Section](https://www.nbcnews.com/business)** to compare Crawl4AI and Firecrawl, highlighting Crawl4AI's speed, efficiency, and advanced features. -Crawl4AI is designed with speed as a primary focus. Our goal is to provide the fastest possible response with high-quality data extraction, minimizing abstractions between the data and the user. +--- -We've conducted a speed comparison between Crawl4AI and Firecrawl, a paid service. The results demonstrate Crawl4AI's superior performance: +#### Results Summary -```bash -Firecrawl: -Time taken: 7.02 seconds -Content length: 42074 characters -Images found: 49 +| **Method** | **Time Taken** | **Markdown Length** | **Fit Markdown** | **Images Found** | +|--------------------------------|----------------|----------------------|-------------------|------------------| +| **Firecrawl** | 6.04 seconds | 38,382 characters | - | 52 | +| **Crawl4AI (Simple Crawl)** | 1.06 seconds | 42,027 characters | - | 52 | +| **Crawl4AI (Markdown Plus)** | 1.30 seconds | 54,342 characters | 11,119 characters | 52 | +| **Crawl4AI (JavaScript)** | 1.56 seconds | 75,869 characters | 13,406 characters | 92 | -Crawl4AI (simple crawl): -Time taken: 1.60 seconds -Content length: 18238 characters -Images found: 49 +--- -Crawl4AI (with JavaScript execution): -Time taken: 4.64 seconds -Content length: 40869 characters -Images found: 89 -``` +#### Key Takeaways -As you can see, Crawl4AI outperforms Firecrawl significantly: +1. **Superior Speed**: Crawl4AI processes even advanced crawls up to **6x faster** than Firecrawl, with times as low as **1.06 seconds**. +2. **Rich Content Extraction**: Crawl4AI consistently captures more comprehensive content, producing a **Markdown Plus** output of **54,342 characters**, compared to Firecrawl's **38,382 characters**. +3. 
**AI-Optimized Output**: With **Fit Markdown**, Crawl4AI removes noise to produce concise, AI-friendly outputs (**11,119–13,406 characters**) tailored for LLM workflows. +4. **Dynamic Content Handling**: Using JavaScript execution, Crawl4AI extracted **92 images** and enriched content dynamically loaded via “Load More” buttons—unmatched by Firecrawl. -- Simple crawl: Crawl4AI is over 4 times faster than Firecrawl. -- With JavaScript execution: Even when executing JavaScript to load more content (doubling the number of images found), Crawl4AI is still faster than Firecrawl's simple crawl. +--- -You can find the full comparison code in our repository at `docs/examples/crawl4ai_vs_firecrawl.py`. +#### Conclusion + +Crawl4AI outshines Firecrawl in speed, completeness, and flexibility. Its advanced features, including **Markdown Plus**, **Fit Markdown**, and **dynamic content handling**, make it the ideal choice for AI-ready web crawling. Whether you're targeting rich structured data or handling complex dynamic websites, Crawl4AI delivers unmatched performance and precision. + +You can find the full comparison code in our repository at [docs/examples/quickstart_async.py](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/quickstart_async.py). 
## Documentation 📚 diff --git a/docs/examples/quickstart_async.py b/docs/examples/quickstart_async.py index d67a8c30..e50fe456 100644 --- a/docs/examples/quickstart_async.py +++ b/docs/examples/quickstart_async.py @@ -13,7 +13,9 @@ import re from typing import Dict, List from bs4 import BeautifulSoup from pydantic import BaseModel, Field -from crawl4ai import AsyncWebCrawler +from crawl4ai import AsyncWebCrawler, CacheMode +from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator +from crawl4ai.content_filter_strategy import BM25ContentFilter from crawl4ai.extraction_strategy import ( JsonCssExtractionStrategy, LLMExtractionStrategy, @@ -51,7 +53,7 @@ async def simple_example_with_running_js_code(): url="https://www.nbcnews.com/business", js_code=js_code, # wait_for=wait_for, - bypass_cache=True, + cache_mode=CacheMode.BYPASS, ) print(result.markdown[:500]) # Print first 500 characters @@ -61,7 +63,7 @@ async def simple_example_with_css_selector(): result = await crawler.arun( url="https://www.nbcnews.com/business", css_selector=".wide-tease-item__description", - bypass_cache=True, + cache_mode=CacheMode.BYPASS, ) print(result.markdown[:500]) # Print first 500 characters @@ -132,7 +134,7 @@ async def extract_structured_data_using_llm(provider: str, api_token: str = None {"model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens"}.""", extra_args=extra_args ), - bypass_cache=True, + cache_mode=CacheMode.BYPASS, ) print(result.extracted_content) @@ -166,7 +168,7 @@ async def extract_structured_data_using_css_extractor(): result = await crawler.arun( url="https://www.coinbase.com/explore", extraction_strategy=extraction_strategy, - bypass_cache=True, + cache_mode=CacheMode.BYPASS, ) assert result.success, "Failed to crawl the page" @@ -213,7 +215,7 @@ async def crawl_dynamic_content_pages_method_1(): session_id=session_id, css_selector="li.Box-sc-g0xbh4-0", js=js_next_page if page > 0 else None, - 
bypass_cache=True, + cache_mode=CacheMode.BYPASS, js_only=page > 0, headless=False, ) @@ -282,7 +284,7 @@ async def crawl_dynamic_content_pages_method_2(): extraction_strategy=extraction_strategy, js_code=js_next_page_and_wait if page > 0 else None, js_only=page > 0, - bypass_cache=True, + cache_mode=CacheMode.BYPASS, headless=False, ) @@ -343,7 +345,7 @@ async def crawl_dynamic_content_pages_method_3(): js_code=js_next_page if page > 0 else None, wait_for=wait_for if page > 0 else None, js_only=page > 0, - bypass_cache=True, + cache_mode=CacheMode.BYPASS, headless=False, ) @@ -384,7 +386,7 @@ async def crawl_with_user_simultion(): url = "YOUR-URL-HERE" result = await crawler.arun( url=url, - bypass_cache=True, + cache_mode=CacheMode.BYPASS, magic = True, # Automatically detects and removes overlays, popups, and other elements that block content # simulate_user = True,# Causes a series of random mouse movements and clicks to simulate user interaction # override_navigator = True # Overrides the navigator object to make it look like a real user @@ -408,7 +410,7 @@ async def speed_comparison(): params={'formats': ['markdown', 'html']} ) end = time.time() - print("Firecrawl (simulated):") + print("Firecrawl:") print(f"Time taken: {end - start:.2f} seconds") print(f"Content length: {len(scrape_status['markdown'])} characters") print(f"Images found: {scrape_status['markdown'].count('cldnry.s-nbcnews.com')}") @@ -420,7 +422,7 @@ async def speed_comparison(): result = await crawler.arun( url="https://www.nbcnews.com/business", word_count_threshold=0, - bypass_cache=True, + cache_mode=CacheMode.BYPASS, verbose=False, ) end = time.time() @@ -430,6 +432,25 @@ async def speed_comparison(): print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}") print() + # Crawl4AI with advanced content filtering + start = time.time() + result = await crawler.arun( + url="https://www.nbcnews.com/business", + word_count_threshold=0, + markdown_generator=DefaultMarkdownGenerator( 
+ content_filter=BM25ContentFilter(user_query=None, bm25_threshold=1.0) + ), + cache_mode=CacheMode.BYPASS, + verbose=False, + ) + end = time.time() + print("Crawl4AI (Markdown Plus):") + print(f"Time taken: {end - start:.2f} seconds") + print(f"Content length: {len(result.markdown_v2.raw_markdown)} characters") + print(f"Fit Markdown: {len(result.markdown_v2.fit_markdown)} characters") + print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}") + print() + # Crawl4AI with JavaScript execution start = time.time() result = await crawler.arun( @@ -438,13 +459,17 @@ async def speed_comparison(): "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();" ], word_count_threshold=0, - bypass_cache=True, + cache_mode=CacheMode.BYPASS, + markdown_generator=DefaultMarkdownGenerator( + content_filter=BM25ContentFilter(user_query=None, bm25_threshold=1.0) + ), verbose=False, ) end = time.time() print("Crawl4AI (with JavaScript execution):") print(f"Time taken: {end - start:.2f} seconds") print(f"Content length: {len(result.markdown)} characters") + print(f"Fit Markdown: {len(result.markdown_v2.fit_markdown)} characters") print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}") print("\nNote on Speed Comparison:") @@ -483,7 +508,7 @@ async def generate_knowledge_graph(): url = "https://paulgraham.com/love.html" result = await crawler.arun( url=url, - bypass_cache=True, + cache_mode=CacheMode.BYPASS, extraction_strategy=extraction_strategy, # magic=True ) @@ -496,7 +521,7 @@ async def fit_markdown_remove_overlay(): url = "https://janineintheworld.com/places-to-visit-in-central-mexico" result = await crawler.arun( url=url, - bypass_cache=True, + cache_mode=CacheMode.BYPASS, word_count_threshold = 10, remove_overlay_elements=True, screenshot = True @@ -509,31 +534,31 @@ async def fit_markdown_remove_overlay(): async def main(): - await 
simple_crawl() - await simple_example_with_running_js_code() - await simple_example_with_css_selector() - await use_proxy() - await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg")) - await extract_structured_data_using_css_extractor() + # await simple_crawl() + # await simple_example_with_running_js_code() + # await simple_example_with_css_selector() + # await use_proxy() + # await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg")) + # await extract_structured_data_using_css_extractor() - # LLM extraction examples - await extract_structured_data_using_llm() - await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY")) - await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY")) - await extract_structured_data_using_llm("ollama/llama3.2") + # # LLM extraction examples + # await extract_structured_data_using_llm() + # await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY")) + # await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY")) + # await extract_structured_data_using_llm("ollama/llama3.2") - # You always can pass custom headers to the extraction strategy - custom_headers = { - "Authorization": "Bearer your-custom-token", - "X-Custom-Header": "Some-Value" - } - await extract_structured_data_using_llm(extra_headers=custom_headers) + # # You always can pass custom headers to the extraction strategy + # custom_headers = { + # "Authorization": "Bearer your-custom-token", + # "X-Custom-Header": "Some-Value" + # } + # await extract_structured_data_using_llm(extra_headers=custom_headers) - # await crawl_dynamic_content_pages_method_1() - # await crawl_dynamic_content_pages_method_2() - await crawl_dynamic_content_pages_method_3() + # # await 
crawl_dynamic_content_pages_method_1() + # # await crawl_dynamic_content_pages_method_2() + # await crawl_dynamic_content_pages_method_3() - await crawl_custom_browser_type() + # await crawl_custom_browser_type() await speed_comparison() From a69f7a953198df1d9d93420161794aafe3fcffcb Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 28 Nov 2024 16:31:41 +0800 Subject: [PATCH 082/115] fix: correct typo in function documentation for clarity and accuracy --- README.md | 184 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 105 insertions(+), 79 deletions(-) diff --git a/README.md b/README.md index 5c50cdc5..c4ef1bd3 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,7 @@ # 🔥🕷️ Crawl4AI: LLM Friendly Web Crawler & Scraper +[✨ Check out what's new in the latest update!](#new-in-03743) + unclecode%2Fcrawl4ai | Trendshift [![GitHub Stars](https://img.shields.io/github/stars/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/stargazers) @@ -9,26 +11,47 @@ [![GitHub Pull Requests](https://img.shields.io/github/issues-pr/unclecode/crawl4ai)](https://github.com/unclecode/crawl4ai/pulls) [![License](https://img.shields.io/github/license/unclecode/crawl4ai)](https://github.com/unclecode/crawl4ai/blob/main/LICENSE) -Crawl4AI simplifies asynchronous web crawling and data extraction, making it accessible for large language models (LLMs) and AI applications. 🆓🌐 +## 🔥 Crawl4AI: Crawl Smarter, Faster, Freely. For AI. -## New in 0.3.743 ✨ +Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant community. It delivers blazing-fast, AI-ready web crawling tailored for LLMs, AI agents, and data pipelines. Open source, flexible, and built for real-time performance, Crawl4AI empowers developers with unmatched speed, precision, and deployment ease. -- 🚀 **Improved ManagedBrowser Configuration**: Dynamic host and port support for more flexible browser management. 
-- 📝 **Enhanced Markdown Generation**: New generator class for better formatting and customization. -- ⚡ **Fast HTML Formatting**: Significantly optimized HTML formatting in the web crawler. -- 🛠️ **Utility & Sanitization Upgrades**: Improved sanitization and expanded utility functions for streamlined workflows. -- 👥 **Acknowledgments**: Added contributor details and pull request acknowledgments for better transparency. +[✨ Check out what's new in the latest update!](#new-in-03743) + +## 🧐 Why Crawl4AI? + +1. **Built for LLMs**: Creates **smart, concise Markdown** optimized for applications like Retrieval-Augmented Generation (RAG) and fine-tuning. +2. **Lightning Fast**: Delivers results **6x faster** than competitors with real-time, cost-efficient performance. +3. **Flexible Browser Control**: Offers session management, proxies, and custom hooks for precise, seamless data access. +4. **Heuristic Intelligence**: Leverages **advanced algorithms** to extract data efficiently, reducing reliance on costly language models. +5. **Open Source & Deployable**: 100% open-source with no API keys or registration required-ready for **Docker and cloud integration**. +6. **Thriving Community**: Actively maintained by a vibrant developer community and the **#1 trending GitHub repository** across all languages. -## Try it Now! +## 🚀 Quick Start -✨ Play around with this [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1SgRPrByQLzjRfwoRNq1wSGE9nYY_EE8C?usp=sharing) +1. Install Crawl4AI: +```bash +pip install crawl4ai +``` -✨ Visit our [Documentation Website](https://crawl4ai.com/mkdocs/) +2. 
Run a simple web crawl: +```python +import asyncio +from crawl4ai import AsyncWebCrawler, CacheMode -## Features ✨ +async def main(): + async with AsyncWebCrawler(verbose=True) as crawler: + result = await crawler.arun(url="https://www.nbcnews.com/business") + # Will soon be changed to result.markdown + print(result.markdown_v2.raw_markdown) -
    +if __name__ == "__main__": + asyncio.run(main()) +``` + +## ✨ Features + +
    📝 Markdown Generation - 🧹 **Clean Markdown**: Generates clean, structured Markdown with accurate formatting. @@ -38,7 +61,7 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it acc - 📚 **BM25 Algorithm**: Employs BM25-based filtering for extracting core information and removing irrelevant content.
    -
    +
    📊 Structured Data Extraction - 🤖 **LLM-Driven Extraction**: Supports all LLMs (open-source and proprietary) for structured data extraction. @@ -49,7 +72,7 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it acc
    -
    +
    🌐 Browser Integration - 🖥️ **Managed Browser**: Use user-owned browsers with full control, avoiding bot detection. @@ -61,7 +84,7 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it acc
    -
    +
    🔎 Crawling & Scraping - 🖼️ **Media Support**: Extract images, audio, videos, and responsive image formats like `srcset` and `picture`. @@ -76,7 +99,7 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it acc
    -
    +
    🚀 Deployment - 🐳 **Dockerized Setup**: Optimized Docker image with API server for easy deployment. @@ -99,7 +122,54 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it acc
    -## Installation 🛠️ + + +## Try it Now! + +✨ Play around with this [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1SgRPrByQLzjRfwoRNq1wSGE9nYY_EE8C?usp=sharing) + +✨ Visit our [Documentation Website](https://crawl4ai.com/mkdocs/) + + +## 🚀 Speed Comparison + +A test was conducted on **[NBC News - Business Section](https://www.nbcnews.com/business)** to compare Crawl4AI and Firecrawl, highlighting Crawl4AI's speed, efficiency, and advanced features. + +
    +📊 Results Summary + +#### Results Summary + +| **Method** | **Time Taken** | **Markdown Length** | **Fit Markdown** | **Images Found** | +|--------------------------------|----------------|----------------------|-------------------|------------------| +| **Firecrawl** | 6.04 seconds | 38,382 characters | - | 52 | +| **Crawl4AI (Simple Crawl)** | 1.06 seconds | 42,027 characters | - | 52 | +| **Crawl4AI (Markdown Plus)** | 1.30 seconds | 54,342 characters | 11,119 characters | 52 | +| **Crawl4AI (JavaScript)** | 1.56 seconds | 75,869 characters | 13,406 characters | 92 | + +
    + +
    +Key Takeaways + +1. **Superior Speed**: Crawl4AI processes even advanced crawls up to **6x faster** than Firecrawl, with times as low as **1.06 seconds**. +2. **Rich Content Extraction**: Crawl4AI consistently captures more comprehensive content, producing a **Markdown Plus** output of **54,342 characters**, compared to Firecrawl's **38,382 characters**. +3. **AI-Optimized Output**: With **Fit Markdown**, Crawl4AI removes noise to produce concise, AI-friendly outputs (**11,119–13,406 characters**) tailored for LLM workflows. +4. **Dynamic Content Handling**: Using JavaScript execution, Crawl4AI extracted **92 images** and enriched content dynamically loaded via “Load More” buttons—unmatched by Firecrawl. + +
    + +
    +🏁 Conclusion + +Crawl4AI outshines Firecrawl in speed, completeness, and flexibility. Its advanced features, including **Markdown Plus**, **Fit Markdown**, and **dynamic content handling**, make it the ideal choice for AI-ready web crawling. Whether you're targeting rich structured data or handling complex dynamic websites, Crawl4AI delivers unmatched performance and precision. + +You can find the full comparison code in our repository at [docs/examples/quickstart_async.py](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/quickstart_async.py). + +
    + + +## 🛠️ Installation 🛠️ Crawl4AI offers flexible installation options to suit various use cases. You can install it as a Python package or use Docker. @@ -259,27 +329,14 @@ For advanced configuration, environment variables, and usage examples, see our [
    -## Quick Start 🚀 -```python -import asyncio -from crawl4ai import AsyncWebCrawler, CacheMode -async def main(): - async with AsyncWebCrawler(verbose=True) as crawler: - result = await crawler.arun(url="https://www.nbcnews.com/business") - print(result.markdown_v2.raw_markdown) # Soone will be change to result.markdown - -if __name__ == "__main__": - asyncio.run(main()) -``` - -## Advanced Usage Examples 🔬 +## 🔬 Advanced Usage Examples 🔬 You can check the project structure in the directory [https://github.com/unclecode/crawl4ai/docs/examples](docs/examples). Over there, you can find a variety of examples; here, some popular examples are shared.
    -🖥️ Heuristic Markdown Generation with Clean and Fit Markdown +📝 Heuristic Markdown Generation with Clean and Fit Markdown ```python import asyncio @@ -310,7 +367,7 @@ if __name__ == "__main__":
    -🖥️ Structured Data Extraction and Executing JavaScript +🖥️ Executing JavaScript & Extract Structured Data without LLMs ```python import asyncio @@ -393,7 +450,7 @@ if __name__ == "__main__":
    -🤖 Extracting Structured Data with LLMs +📚 Extracting Structured Data with LLMs ```python import os @@ -480,74 +537,43 @@ async def test_news_crawl():
    -## Speed Comparison 🚀 -A test was conducted on **[NBC News - Business Section](https://www.nbcnews.com/business)** to compare Crawl4AI and Firecrawl, highlighting Crawl4AI's speed, efficiency, and advanced features. +## ✨ New in 0.3.743 ---- +- 🚀 **Improved ManagedBrowser Configuration**: Dynamic host and port support for more flexible browser management. +- 📝 **Enhanced Markdown Generation**: New generator class for better formatting and customization. +- ⚡ **Fast HTML Formatting**: Significantly optimized HTML formatting in the web crawler. +- 🛠️ **Utility & Sanitization Upgrades**: Improved sanitization and expanded utility functions for streamlined workflows. +- 👥 **Acknowledgments**: Added contributor details and pull request acknowledgments for better transparency. -#### Results Summary -| **Method** | **Time Taken** | **Markdown Length** | **Fit Markdown** | **Images Found** | -|--------------------------------|----------------|----------------------|-------------------|------------------| -| **Firecrawl** | 6.04 seconds | 38,382 characters | - | 52 | -| **Crawl4AI (Simple Crawl)** | 1.06 seconds | 42,027 characters | - | 52 | -| **Crawl4AI (Markdown Plus)** | 1.30 seconds | 54,342 characters | 11,119 characters | 52 | -| **Crawl4AI (JavaScript)** | 1.56 seconds | 75,869 characters | 13,406 characters | 92 | - ---- - -#### Key Takeaways - -1. **Superior Speed**: Crawl4AI processes even advanced crawls up to **6x faster** than Firecrawl, with times as low as **1.06 seconds**. -2. **Rich Content Extraction**: Crawl4AI consistently captures more comprehensive content, producing a **Markdown Plus** output of **54,342 characters**, compared to Firecrawl's **38,382 characters**. -3. **AI-Optimized Output**: With **Fit Markdown**, Crawl4AI removes noise to produce concise, AI-friendly outputs (**11,119–13,406 characters**) tailored for LLM workflows. -4. 
**Dynamic Content Handling**: Using JavaScript execution, Crawl4AI extracted **92 images** and enriched content dynamically loaded via “Load More” buttons—unmatched by Firecrawl. - ---- - -#### Conclusion - -Crawl4AI outshines Firecrawl in speed, completeness, and flexibility. Its advanced features, including **Markdown Plus**, **Fit Markdown**, and **dynamic content handling**, make it the ideal choice for AI-ready web crawling. Whether you're targeting rich structured data or handling complex dynamic websites, Crawl4AI delivers unmatched performance and precision. - -You can find the full comparison code in our repository at [docs/examples/quickstart_async.py](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/quickstart_async.py). - -## Documentation 📚 +## 📖 Documentation & Roadmap For detailed documentation, including installation instructions, advanced features, and API reference, visit our [Documentation Website](https://crawl4ai.com/mkdocs/). -## Crawl4AI Roadmap 🗺️ +To see our development plans and upcoming features, check out our [Roadmap](https://github.com/unclecode/crawl4ai/blob/main/ROADMAP.md). -For detailed information on our development plans and upcoming features, check out our [Roadmap](https://github.com/unclecode/crawl4ai/blob/main/ROADMAP.md). - -### Advanced Crawling Systems 🔧 - [x] 0. Graph Crawler: Smart website traversal using graph search algorithms for comprehensive nested page extraction - [ ] 1. Question-Based Crawler: Natural language driven web discovery and content extraction - [ ] 2. Knowledge-Optimal Crawler: Smart crawling that maximizes knowledge while minimizing data extraction - [ ] 3. Agentic Crawler: Autonomous system for complex multi-step crawling operations - -### Specialized Features 🛠️ - [ ] 4. Automated Schema Generator: Convert natural language to extraction schemas - [ ] 5. Domain-Specific Scrapers: Pre-configured extractors for common platforms (academic, e-commerce) - [ ] 6.
Web Embedding Index: Semantic search infrastructure for crawled content - -### Development Tools 🔨 - [ ] 7. Interactive Playground: Web UI for testing, comparing strategies with AI assistance - [ ] 8. Performance Monitor: Real-time insights into crawler operations - [ ] 9. Cloud Integration: One-click deployment solutions across cloud providers - -### Community & Growth 🌱 - [ ] 10. Sponsorship Program: Structured support system with tiered benefits - [ ] 11. Educational Content: "How to Crawl" video series and interactive tutorials -## Contributing 🤝 +## 🤝 Contributing We welcome contributions from the open-source community. Check out our [contribution guidelines](https://github.com/unclecode/crawl4ai/blob/main/CONTRIBUTING.md) for more information. -## License 📄 +## 📄 License Crawl4AI is released under the [Apache 2.0 License](https://github.com/unclecode/crawl4ai/blob/main/LICENSE). -## Contact 📧 +## 📧 Contact For questions, suggestions, or feedback, feel free to reach out: @@ -558,7 +584,7 @@ For questions, suggestions, or feedback, feel free to reach out: Happy Crawling! 🕸️🚀 -# Mission +## 🗾 Mission Our mission is to unlock the untapped potential of personal and enterprise data in the digital age. In today's world, individuals and organizations generate vast amounts of valuable digital footprints, yet this data remains largely uncapitalized as a true asset. @@ -570,13 +596,13 @@ This democratization of data represents the first step toward a shared data econ For a detailed exploration of our vision, opportunities, and pathway forward, please see our [full mission statement](./MISSION.md). 
-## Key Opportunities +### Key Opportunities - **Data Capitalization**: Transform digital footprints into valuable assets that can appear on personal and enterprise balance sheets - **Authentic Data**: Unlock the vast reservoir of real human insights and knowledge for AI advancement - **Shared Economy**: Create new value streams where data creators directly benefit from their contributions -## Development Pathway +### Development Pathway 1. **Open-Source Foundation**: Building transparent, community-driven data extraction tools 2. **Data Capitalization Platform**: Creating tools to structure and value digital assets From ddfb6707b47b6be786c2115cd7511b3d94d89e7c Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 28 Nov 2024 16:34:08 +0800 Subject: [PATCH 083/115] docs: update README to reflect new branding and improve section headings for clarity --- README.md | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index c4ef1bd3..ed6892ec 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,4 @@ -# 🔥🕷️ Crawl4AI: LLM Friendly Web Crawler & Scraper - -[✨ Check out what's new in the latest update!](#new-in-03743) +# 🔥🕷️ Crawl4AI: Crawl Smarter, Faster, Freely. For AI. unclecode%2Fcrawl4ai | Trendshift @@ -11,11 +9,9 @@ [![GitHub Pull Requests](https://img.shields.io/github/issues-pr/unclecode/crawl4ai)](https://github.com/unclecode/crawl4ai/pulls) [![License](https://img.shields.io/github/license/unclecode/crawl4ai)](https://github.com/unclecode/crawl4ai/blob/main/LICENSE) -## 🔥 Crawl4AI: Crawl Smarter, Faster, Freely. For AI. - Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant community. It delivers blazing-fast, AI-ready web crawling tailored for LLMs, AI agents, and data pipelines. Open source, flexible, and built for real-time performance, Crawl4AI empowers developers with unmatched speed, precision, and deployment ease. 
-[✨ Check out what's new in the latest update!](#new-in-03743) +[✨ Check out what's new in the latest update!](#recent-updates) ## 🧐 Why Crawl4AI? @@ -537,7 +533,7 @@ async def test_news_crawl():
    -## ✨ New in 0.3.743 +## ✨ Recent Updates - 🚀 **Improved ManagedBrowser Configuration**: Dynamic host and port support for more flexible browser management. - 📝 **Enhanced Markdown Generation**: New generator class for better formatting and customization. From 3fda66b85b793655a92b3627599472f4d3279b0b Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 28 Nov 2024 16:36:24 +0800 Subject: [PATCH 084/115] docs: refine README content for clarity and conciseness, improving descriptions and formatting --- README.md | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index ed6892ec..7bf4b4a4 100644 --- a/README.md +++ b/README.md @@ -15,13 +15,12 @@ Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant ## 🧐 Why Crawl4AI? -1. **Built for LLMs**: Creates **smart, concise Markdown** optimized for applications like Retrieval-Augmented Generation (RAG) and fine-tuning. -2. **Lightning Fast**: Delivers results **6x faster** than competitors with real-time, cost-efficient performance. -3. **Flexible Browser Control**: Offers session management, proxies, and custom hooks for precise, seamless data access. -4. **Heuristic Intelligence**: Leverages **advanced algorithms** to extract data efficiently, reducing reliance on costly language models. -5. **Open Source & Deployable**: 100% open-source with no API keys or registration required-ready for **Docker and cloud integration**. -6. **Thriving Community**: Actively maintained by a vibrant developer community and the **#1 trending GitHub repository** across all languages. - +1. **Built for LLMs**: Creates smart, concise Markdown optimized for RAG and fine-tuning applications. +2. **Lightning Fast**: Delivers results 6x faster with real-time, cost-efficient performance. +3. **Flexible Browser Control**: Offers session management, proxies, and custom hooks for seamless data access. +4. 
**Heuristic Intelligence**: Uses advanced algorithms for efficient extraction, reducing reliance on costly models. +5. **Open Source & Deployable**: Fully open-source with no API keys—ready for Docker and cloud integration. +6. **Thriving Community**: Actively maintained by a vibrant community and the #1 trending GitHub repository. ## 🚀 Quick Start @@ -145,7 +144,7 @@ A test was conducted on **[NBC News - Business Section](https://www.nbcnews.com/
    -
    +
    Key Takeaways 1. **Superior Speed**: Crawl4AI processes even advanced crawls up to **6x faster** than Firecrawl, with times as low as **1.06 seconds**. @@ -155,7 +154,7 @@ A test was conducted on **[NBC News - Business Section](https://www.nbcnews.com/
    -
    +
    🏁 Conclusion Crawl4AI outshines Firecrawl in speed, completeness, and flexibility. Its advanced features, including **Markdown Plus**, **Fit Markdown**, and **dynamic content handling**, make it the ideal choice for AI-ready web crawling. Whether you're targeting rich structured data or handling complex dynamic websites, Crawl4AI delivers unmatched performance and precision. @@ -169,7 +168,7 @@ You can find the full comparison code in our repository at [docs/examples/quicks Crawl4AI offers flexible installation options to suit various use cases. You can install it as a Python package or use Docker. -
    +
    🐍 Using pip Choose the installation option that best fits your needs: @@ -234,7 +233,7 @@ pip install -e ".[all]" # Install all optional features
    -
    +
    🚀 One-Click Deployment Deploy your own instance of Crawl4AI with one click: @@ -251,7 +250,7 @@ The deploy will:
    -
    +
    🐳 Using Docker Crawl4AI is available as Docker images for easy deployment. You can either pull directly from Docker Hub (recommended) or build from the repository. @@ -325,13 +324,11 @@ For advanced configuration, environment variables, and usage examples, see our [
    - - ## 🔬 Advanced Usage Examples 🔬 You can check the project structure in the directory [https://github.com/unclecode/crawl4ai/docs/examples](docs/examples). Over there, you can find a variety of examples; here, some popular examples are shared. -
    +
    📝 Heuristic Markdown Generation with Clean and Fit Markdown ```python @@ -362,7 +359,7 @@ if __name__ == "__main__":
    -
    +
    🖥️ Executing JavaScript & Extract Structured Data without LLMs ```python @@ -445,7 +442,7 @@ if __name__ == "__main__":
    -
    +
    📚 Extracting Structured Data with LLMs ```python @@ -485,7 +482,7 @@ if __name__ == "__main__":
    -
    +
    🤖 Using You own Browswer with Custome User Profile ```python From efe93a5f57ebe677cc12dca90549525626a85b98 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 28 Nov 2024 16:41:11 +0800 Subject: [PATCH 085/115] docs: enhance README with development TODOs and refine mission statement for clarity --- README.md | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 7bf4b4a4..20395b58 100644 --- a/README.md +++ b/README.md @@ -545,6 +545,9 @@ For detailed documentation, including installation instructions, advanced featur Moreover to check our development plans and upcoming features, check out our [Roadmap](https://github.com/unclecode/crawl4ai/blob/main/ROADMAP.md). +
    +📈 Development TODOs + - [x] 0. Graph Crawler: Smart website traversal using graph search algorithms for comprehensive nested page extraction - [ ] 1. Question-Based Crawler: Natural language driven web discovery and content extraction - [ ] 2. Knowledge-Optimal Crawler: Smart crawling that maximizes knowledge while minimizing data extraction @@ -558,6 +561,8 @@ Moreover to check our development plans and upcoming features, check out our [Ro - [ ] 10. Sponsorship Program: Structured support system with tiered benefits - [ ] 11. Educational Content: "How to Crawl" video series and interactive tutorials +
    + ## 🤝 Contributing We welcome contributions from the open-source community. Check out our [contribution guidelines](https://github.com/unclecode/crawl4ai/blob/main/CONTRIBUTING.md) for more information. @@ -576,32 +581,32 @@ For questions, suggestions, or feedback, feel free to reach out: Happy Crawling! 🕸️🚀 - ## 🗾 Mission -Our mission is to unlock the untapped potential of personal and enterprise data in the digital age. In today's world, individuals and organizations generate vast amounts of valuable digital footprints, yet this data remains largely uncapitalized as a true asset. +Our mission is to unlock the value of personal and enterprise data by transforming digital footprints into structured, tradeable assets. Crawl4AI empowers individuals and organizations with open-source tools to extract and structure data, fostering a shared data economy. -Our open-source solution empowers developers and innovators to build tools for data extraction and structuring, laying the foundation for a new era of data ownership. By transforming personal and enterprise data into structured, tradeable assets, we're creating opportunities for individuals to capitalize on their digital footprints and for organizations to unlock the value of their collective knowledge. +We envision a future where AI is powered by real human knowledge, ensuring data creators directly benefit from their contributions. By democratizing data and enabling ethical sharing, we are laying the foundation for authentic AI advancement. -This democratization of data represents the first step toward a shared data economy, where willing participation in data sharing drives AI advancement while ensuring the benefits flow back to data creators. Through this approach, we're building a future where AI development is powered by authentic human knowledge rather than synthetic alternatives. +
    +🔑 Key Opportunities + +- **Data Capitalization**: Transform digital footprints into measurable, valuable assets. +- **Authentic AI Data**: Provide AI systems with real human insights. +- **Shared Economy**: Create a fair data marketplace that benefits data creators. -![Mission Diagram](./docs/assets/pitch-dark.svg) +
    -For a detailed exploration of our vision, opportunities, and pathway forward, please see our [full mission statement](./MISSION.md). +
    +🚀 Development Pathway -### Key Opportunities +1. **Open-Source Tools**: Community-driven platforms for transparent data extraction. +2. **Digital Asset Structuring**: Tools to organize and value digital knowledge. +3. **Ethical Data Marketplace**: A secure, fair platform for exchanging structured data. -- **Data Capitalization**: Transform digital footprints into valuable assets that can appear on personal and enterprise balance sheets -- **Authentic Data**: Unlock the vast reservoir of real human insights and knowledge for AI advancement -- **Shared Economy**: Create new value streams where data creators directly benefit from their contributions +For more details, see our [full mission statement](./MISSION.md). +
    -### Development Pathway -1. **Open-Source Foundation**: Building transparent, community-driven data extraction tools -2. **Data Capitalization Platform**: Creating tools to structure and value digital assets -3. **Shared Data Marketplace**: Establishing an economic platform for ethical data exchange - -For a detailed exploration of our vision, challenges, and solutions, please see our [full mission statement](./MISSION.md). ## Star History From 0bccf23db3f90bf07342f34591c91b92eb1cdf89 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 28 Nov 2024 18:19:42 +0800 Subject: [PATCH 086/115] docs: update quickstart_async.py to enable example function calls for better demonstration --- docs/examples/quickstart_async.py | 36 +++++++++++++++---------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/docs/examples/quickstart_async.py b/docs/examples/quickstart_async.py index e50fe456..9f1eff53 100644 --- a/docs/examples/quickstart_async.py +++ b/docs/examples/quickstart_async.py @@ -534,31 +534,31 @@ async def fit_markdown_remove_overlay(): async def main(): - # await simple_crawl() - # await simple_example_with_running_js_code() - # await simple_example_with_css_selector() - # await use_proxy() - # await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg")) - # await extract_structured_data_using_css_extractor() + await simple_crawl() + await simple_example_with_running_js_code() + await simple_example_with_css_selector() + await use_proxy() + await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg")) + await extract_structured_data_using_css_extractor() - # # LLM extraction examples + # LLM extraction examples # await extract_structured_data_using_llm() # await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY")) - # await 
extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY")) # await extract_structured_data_using_llm("ollama/llama3.2") + await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY")) - # # You always can pass custom headers to the extraction strategy - # custom_headers = { - # "Authorization": "Bearer your-custom-token", - # "X-Custom-Header": "Some-Value" - # } - # await extract_structured_data_using_llm(extra_headers=custom_headers) + # You always can pass custom headers to the extraction strategy + custom_headers = { + "Authorization": "Bearer your-custom-token", + "X-Custom-Header": "Some-Value" + } + await extract_structured_data_using_llm(extra_headers=custom_headers) - # # await crawl_dynamic_content_pages_method_1() - # # await crawl_dynamic_content_pages_method_2() - # await crawl_dynamic_content_pages_method_3() + # await crawl_dynamic_content_pages_method_1() + # await crawl_dynamic_content_pages_method_2() + await crawl_dynamic_content_pages_method_3() - # await crawl_custom_browser_type() + await crawl_custom_browser_type() await speed_comparison() From a036b7f12224d6a424118e3d113e49ab1e2c9e13 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 28 Nov 2024 19:24:07 +0800 Subject: [PATCH 087/115] feat: implement create_box_message utility for formatted error messages and enhance error logging in AsyncWebCrawler --- crawl4ai/async_crawler_strategy.py | 21 +++++----- crawl4ai/async_webcrawler.py | 8 ++-- crawl4ai/utils.py | 64 ++++++++++++++++++++++++++++-- 3 files changed, 77 insertions(+), 16 deletions(-) diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 882f9a50..e5316187 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -15,7 +15,7 @@ import hashlib import json import uuid from .models import AsyncCrawlResponse - +from .utils import create_box_message from playwright_stealth import StealthConfig, stealth_async stealth_config 
= StealthConfig( @@ -321,10 +321,10 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): "--disable-infobars", "--window-position=0,0", "--ignore-certificate-errors", - "--ignore-certificate-errors-spki-list", + "--ignore-certificate-errors-spki-list" ] } - + # Add channel if specified (try Chrome first) if self.chrome_channel: browser_args["channel"] = self.chrome_channel @@ -765,12 +765,15 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): await self.execute_hook('before_goto', page, context = context) - response = await page.goto( - url, - # wait_until=kwargs.get("wait_until", ["domcontentloaded", "networkidle"]), - wait_until=kwargs.get("wait_until", "domcontentloaded"), - timeout=kwargs.get("page_timeout", 60000) - ) + try: + response = await page.goto( + url, + # wait_until=kwargs.get("wait_until", ["domcontentloaded", "networkidle"]), + wait_until=kwargs.get("wait_until", "domcontentloaded"), + timeout=kwargs.get("page_timeout", 60000), + ) + except Error as e: + raise RuntimeError(f"Failed on navigating ACS-GOTO :\n{str(e)}") # response = await page.goto("about:blank") # await page.evaluate(f"window.location.href = '{url}'") diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 5a46fe39..66b4c21b 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -26,8 +26,10 @@ from .utils import ( sanitize_input_encode, InvalidCSSSelectorError, format_html, - fast_format_html + fast_format_html, + create_box_message ) + from urllib.parse import urlparse import random from .__version__ import __version__ as crawl4ai_version @@ -326,15 +328,15 @@ class AsyncWebCrawler: if not hasattr(e, "msg"): e.msg = str(e) # print(f"{Fore.RED}{self.tag_format('ERROR')} {self.log_icons['ERROR']} Failed to crawl {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... 
| {e.msg}{Style.RESET_ALL}") + self.logger.error_status( url=cache_context.display_url, - error=e.msg, + error=create_box_message(e.msg, type = "error"), tag="ERROR" ) return CrawlResult( url=url, html="", - markdown=f"[ERROR] 🚫 arun(): Failed to crawl {cache_context.display_url}, error: {e.msg}", success=False, error_message=e.msg ) diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index aaf27e91..253ec079 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -17,7 +17,8 @@ from requests.exceptions import InvalidSchema import hashlib from typing import Optional, Tuple, Dict, Any import xxhash - +from colorama import Fore, Style, init +import textwrap from .html2text import HTML2Text class CustomHTML2Text(HTML2Text): @@ -103,12 +104,67 @@ class CustomHTML2Text(HTML2Text): self.preserved_content.append(data) return super().handle_data(data, entity_char) - - - class InvalidCSSSelectorError(Exception): pass + +def create_box_message( + message: str, + type: str = "info", + width: int = 80, + add_newlines: bool = True, + double_line: bool = False +) -> str: + init() + + # Define border and text colors for different types + styles = { + "warning": (Fore.YELLOW, Fore.LIGHTYELLOW_EX, "⚠"), + "info": (Fore.BLUE, Fore.LIGHTBLUE_EX, "ℹ"), + "success": (Fore.GREEN, Fore.LIGHTGREEN_EX, "✓"), + "error": (Fore.RED, Fore.LIGHTRED_EX, "×"), + } + + border_color, text_color, prefix = styles.get(type.lower(), styles["info"]) + + # Define box characters based on line style + box_chars = { + "single": ("─", "│", "┌", "┐", "└", "┘"), + "double": ("═", "║", "╔", "╗", "╚", "╝") + } + line_style = "double" if double_line else "single" + h_line, v_line, tl, tr, bl, br = box_chars[line_style] + + # Process lines with lighter text color + formatted_lines = [] + raw_lines = message.split('\n') + + if raw_lines: + first_line = f"{prefix} {raw_lines[0].strip()}" + wrapped_first = textwrap.fill(first_line, width=width-4) + formatted_lines.extend(wrapped_first.split('\n')) + + for line in 
raw_lines[1:]: + if line.strip(): + wrapped = textwrap.fill(f" {line.strip()}", width=width-4) + formatted_lines.extend(wrapped.split('\n')) + else: + formatted_lines.append("") + + # Create the box with colored borders and lighter text + horizontal_line = h_line * (width - 1) + box = [ + f"{border_color}{tl}{horizontal_line}{tr}", + *[f"{border_color}{v_line}{text_color} {line:<{width-2}}{border_color}{v_line}" for line in formatted_lines], + f"{border_color}{bl}{horizontal_line}{br}{Style.RESET_ALL}" + ] + + result = "\n".join(box) + if add_newlines: + result = f"\n{result}\n" + + return result + def calculate_semaphore_count(): cpu_count = os.cpu_count() memory_gb = get_system_memory() / (1024 ** 3) # Convert to GB From a9b6b6523812333400fd66730ce3e3c184ad79e2 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 28 Nov 2024 19:26:50 +0800 Subject: [PATCH 088/115] chore: update version to 0.3.744 and add publish.sh to .gitignore --- .gitignore | 1 + crawl4ai/__version__.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 8e96fa82..52e25a2a 100644 --- a/.gitignore +++ b/.gitignore @@ -214,3 +214,4 @@ git_issues.md todo_executor.md protect-all-except-feature.sh manage-collab.sh +publish.sh \ No newline at end of file diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index 073b371c..e38cc61b 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,2 +1,2 @@ # crawl4ai/_version.py -__version__ = "0.3.743" +__version__ = "0.3.744" From b14e83f49951cba097e67464546ba2b4f2787cdc Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 28 Nov 2024 19:31:09 +0800 Subject: [PATCH 089/115] docs: fix link formatting for recent updates section in README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d22d8940..26cc9fcc 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ Crawl4AI is the #1 trending GitHub repository, actively maintained by a 
vibrant community. It delivers blazing-fast, AI-ready web crawling tailored for LLMs, AI agents, and data pipelines. Open source, flexible, and built for real-time performance, Crawl4AI empowers developers with unmatched speed, precision, and deployment ease. -[✨ Check out what's new in the latest update!](#recent-updates) +[✨ Check out what's new in the latest update!](#-recent-updates) ## 🧐 Why Crawl4AI? From 776efa74a4c9fde71377f986cc69b201632a59c0 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 28 Nov 2024 19:32:32 +0800 Subject: [PATCH 090/115] docs: fix link formatting for recent updates section in README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 26cc9fcc..01197868 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant community. It delivers blazing-fast, AI-ready web crawling tailored for LLMs, AI agents, and data pipelines. Open source, flexible, and built for real-time performance, Crawl4AI empowers developers with unmatched speed, precision, and deployment ease. -[✨ Check out what's new in the latest update!](#-recent-updates) +[✨ Check out what's new in the latest update!](#--recent-updates) ## 🧐 Why Crawl4AI? From 48d43c14b1864b87866e8114f5c4fc6e415b6e51 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 28 Nov 2024 19:33:02 +0800 Subject: [PATCH 091/115] docs: fix link formatting for recent updates section in README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 01197868..26cc9fcc 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant community. It delivers blazing-fast, AI-ready web crawling tailored for LLMs, AI agents, and data pipelines. 
Open source, flexible, and built for real-time performance, Crawl4AI empowers developers with unmatched speed, precision, and deployment ease. -[✨ Check out what's new in the latest update!](#--recent-updates) +[✨ Check out what's new in the latest update!](#-recent-updates) ## 🧐 Why Crawl4AI? From 9221c08418bbfaa0d0cf48b4f933e3a2ae722f3a Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 28 Nov 2024 19:33:36 +0800 Subject: [PATCH 092/115] docs: fix link formatting for recent updates section in README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d22d8940..26cc9fcc 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant community. It delivers blazing-fast, AI-ready web crawling tailored for LLMs, AI agents, and data pipelines. Open source, flexible, and built for real-time performance, Crawl4AI empowers developers with unmatched speed, precision, and deployment ease. -[✨ Check out what's new in the latest update!](#recent-updates) +[✨ Check out what's new in the latest update!](#-recent-updates) ## 🧐 Why Crawl4AI? From cf35cbe59e39222b8e3c76ffadc67a7fea55df7a Mon Sep 17 00:00:00 2001 From: Paulo Kuong Date: Thu, 28 Nov 2024 06:46:36 -0500 Subject: [PATCH 093/115] CRAWL4_AI_BASE_DIRECTORY should be Path object instead of string (#298) Thank you so much for your point. Yes, that's correct. I accept your pull request, and I add your name to a contribution list. Thank you again. 
--- setup.py | 50 +++++++++++++++++++++++++++++++++++--------------- 1 file changed, 35 insertions(+), 15 deletions(-) diff --git a/setup.py b/setup.py index f5f3cf2d..796c3bf9 100644 --- a/setup.py +++ b/setup.py @@ -9,10 +9,16 @@ import asyncio # Create the .crawl4ai folder in the user's home directory if it doesn't exist # If the folder already exists, remove the cache folder -crawl4ai_folder = os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()) / ".crawl4ai" +crawl4ai_folder = Path(os.getenv("CRAWL4_AI_BASE_DIRECTORY")) or Path.home() +crawl4ai_folder = crawl4ai_folder / ".crawl4ai" cache_folder = crawl4ai_folder / "cache" -content_folders = ['html_content', 'cleaned_html', 'markdown_content', - 'extracted_content', 'screenshots'] +content_folders = [ + "html_content", + "cleaned_html", + "markdown_content", + "extracted_content", + "screenshots", +] # Clean up old cache if exists if cache_folder.exists(): @@ -28,7 +34,7 @@ for folder in content_folders: __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) with open(os.path.join(__location__, "requirements.txt")) as f: requirements = f.read().splitlines() - + with open("crawl4ai/__version__.py") as f: for line in f: if line.startswith("__version__"): @@ -37,11 +43,12 @@ with open("crawl4ai/__version__.py") as f: # Define requirements default_requirements = requirements -torch_requirements = ["torch", "nltk", "scikit-learn"] +torch_requirements = ["torch", "nltk", "scikit-learn"] transformer_requirements = ["transformers", "tokenizers"] -cosine_similarity_requirements = ["torch", "transformers", "nltk" ] +cosine_similarity_requirements = ["torch", "transformers", "nltk"] sync_requirements = ["selenium"] + def install_playwright(): print("Installing Playwright browsers...") try: @@ -49,16 +56,22 @@ def install_playwright(): print("Playwright installation completed successfully.") except subprocess.CalledProcessError as e: print(f"Error during Playwright installation: {e}") - 
print("Please run 'python -m playwright install' manually after the installation.") + print( + "Please run 'python -m playwright install' manually after the installation." + ) except Exception as e: print(f"Unexpected error during Playwright installation: {e}") - print("Please run 'python -m playwright install' manually after the installation.") + print( + "Please run 'python -m playwright install' manually after the installation." + ) + def run_migration(): """Initialize database during installation""" try: print("Starting database initialization...") from crawl4ai.async_database import async_db_manager + asyncio.run(async_db_manager.initialize()) print("Database initialization completed successfully.") except ImportError: @@ -67,12 +80,14 @@ def run_migration(): print(f"Warning: Database initialization failed: {e}") print("Database will be initialized on first use") + class PostInstallCommand(install): def run(self): install.run(self) install_playwright() # run_migration() + setup( name="Crawl4AI", version=version, @@ -84,18 +99,23 @@ setup( author_email="unclecode@kidocode.com", license="MIT", packages=find_packages(), - install_requires=default_requirements + ["playwright", "aiofiles"], # Added aiofiles + install_requires=default_requirements + + ["playwright", "aiofiles"], # Added aiofiles extras_require={ "torch": torch_requirements, "transformer": transformer_requirements, "cosine": cosine_similarity_requirements, "sync": sync_requirements, - "all": default_requirements + torch_requirements + transformer_requirements + cosine_similarity_requirements + sync_requirements, + "all": default_requirements + + torch_requirements + + transformer_requirements + + cosine_similarity_requirements + + sync_requirements, }, entry_points={ - 'console_scripts': [ - 'crawl4ai-download-models=crawl4ai.model_loader:main', - 'crawl4ai-migrate=crawl4ai.migrations:main', # Added migration command + "console_scripts": [ + "crawl4ai-download-models=crawl4ai.model_loader:main", + 
"crawl4ai-migrate=crawl4ai.migrations:main", # Added migration command ], }, classifiers=[ @@ -110,6 +130,6 @@ setup( ], python_requires=">=3.7", cmdclass={ - 'install': PostInstallCommand, + "install": PostInstallCommand, }, -) \ No newline at end of file +) From 1d83c493aff8672c9da471c222f60c5c72145b71 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 28 Nov 2024 19:58:40 +0800 Subject: [PATCH 094/115] Enhance setup process and update contributors list - Acknowledge contributor paulokuong for fixing RAWL4_AI_BASE_DIRECTORY issue - Refine base directory handling in `setup.py` - Clarify Playwright installation instructions and improve error handling --- CONTRIBUTORS.md | 1 + setup.py | 48 ++++++++++++++++++++++++++++++++++-------------- 2 files changed, 35 insertions(+), 14 deletions(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index deb46a9c..663e5541 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -21,6 +21,7 @@ We would like to thank the following people for their contributions to Crawl4AI: - [nelzomal](https://github.com/nelzomal) - Enhance development installation instructions [#286](https://github.com/unclecode/crawl4ai/pull/286) - [HamzaFarhan](https://github.com/HamzaFarhan) - Handled the cases where markdown_with_citations, references_markdown, and filtered_html might not be defined [#293](https://github.com/unclecode/crawl4ai/pull/293) - [NanmiCoder](https://github.com/NanmiCoder) - fix: crawler strategy exception handling and fixes [#271](https://github.com/unclecode/crawl4ai/pull/271) +- [paulokuong](https://github.com/paulokuong) - fix: RAWL4_AI_BASE_DIRECTORY should be Path object instead of string [#298](https://github.com/unclecode/crawl4ai/pull/298) ## Other Contributors diff --git a/setup.py b/setup.py index f5f3cf2d..dbb07410 100644 --- a/setup.py +++ b/setup.py @@ -9,10 +9,16 @@ import asyncio # Create the .crawl4ai folder in the user's home directory if it doesn't exist # If the folder already exists, remove the cache folder 
-crawl4ai_folder = os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()) / ".crawl4ai" +crawl4ai_folder = Path(os.getenv("CRAWL4_AI_BASE_DIRECTORY")) or Path.home() +crawl4ai_folder = crawl4ai_folder / ".crawl4ai" cache_folder = crawl4ai_folder / "cache" -content_folders = ['html_content', 'cleaned_html', 'markdown_content', - 'extracted_content', 'screenshots'] +content_folders = [ + "html_content", + "cleaned_html", + "markdown_content", + "extracted_content", + "screenshots", +] # Clean up old cache if exists if cache_folder.exists(): @@ -28,7 +34,7 @@ for folder in content_folders: __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) with open(os.path.join(__location__, "requirements.txt")) as f: requirements = f.read().splitlines() - + with open("crawl4ai/__version__.py") as f: for line in f: if line.startswith("__version__"): @@ -37,11 +43,12 @@ with open("crawl4ai/__version__.py") as f: # Define requirements default_requirements = requirements -torch_requirements = ["torch", "nltk", "scikit-learn"] +torch_requirements = ["torch", "nltk", "scikit-learn"] transformer_requirements = ["transformers", "tokenizers"] -cosine_similarity_requirements = ["torch", "transformers", "nltk" ] +cosine_similarity_requirements = ["torch", "transformers", "nltk"] sync_requirements = ["selenium"] + def install_playwright(): print("Installing Playwright browsers...") try: @@ -49,16 +56,22 @@ def install_playwright(): print("Playwright installation completed successfully.") except subprocess.CalledProcessError as e: print(f"Error during Playwright installation: {e}") - print("Please run 'python -m playwright install' manually after the installation.") + print( + "Please run 'python -m playwright install' manually after the installation." 
+ ) except Exception as e: print(f"Unexpected error during Playwright installation: {e}") - print("Please run 'python -m playwright install' manually after the installation.") + print( + "Please run 'python -m playwright install' manually after the installation." + ) + def run_migration(): """Initialize database during installation""" try: print("Starting database initialization...") from crawl4ai.async_database import async_db_manager + asyncio.run(async_db_manager.initialize()) print("Database initialization completed successfully.") except ImportError: @@ -67,12 +80,14 @@ def run_migration(): print(f"Warning: Database initialization failed: {e}") print("Database will be initialized on first use") + class PostInstallCommand(install): def run(self): install.run(self) install_playwright() # run_migration() + setup( name="Crawl4AI", version=version, @@ -84,18 +99,23 @@ setup( author_email="unclecode@kidocode.com", license="MIT", packages=find_packages(), - install_requires=default_requirements + ["playwright", "aiofiles"], # Added aiofiles + install_requires=default_requirements + + ["playwright", "aiofiles"], # Added aiofiles extras_require={ "torch": torch_requirements, "transformer": transformer_requirements, "cosine": cosine_similarity_requirements, "sync": sync_requirements, - "all": default_requirements + torch_requirements + transformer_requirements + cosine_similarity_requirements + sync_requirements, + "all": default_requirements + + torch_requirements + + transformer_requirements + + cosine_similarity_requirements + + sync_requirements, }, entry_points={ - 'console_scripts': [ - 'crawl4ai-download-models=crawl4ai.model_loader:main', - 'crawl4ai-migrate=crawl4ai.migrations:main', # Added migration command + "console_scripts": [ + "crawl4ai-download-models=crawl4ai.model_loader:main", + "crawl4ai-migrate=crawl4ai.migrations:main", # Added migration command ], }, classifiers=[ @@ -110,6 +130,6 @@ setup( ], python_requires=">=3.7", cmdclass={ - 'install': 
PostInstallCommand, + "install": PostInstallCommand, }, ) \ No newline at end of file From 652d396a818a01d9673920da8c1a2d166f0d23f1 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 28 Nov 2024 20:00:29 +0800 Subject: [PATCH 095/115] chore: update version to 0.3.745 --- crawl4ai/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index e38cc61b..8b69d491 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,2 +1,2 @@ # crawl4ai/_version.py -__version__ = "0.3.744" +__version__ = "0.3.745" From 7d81c17cca98b720d06743d6398d1184350ccc75 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 28 Nov 2024 20:02:39 +0800 Subject: [PATCH 096/115] fix: improve handling of CRAWL4_AI_BASE_DIRECTORY environment variable in setup.py --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index dbb07410..d891ff9f 100644 --- a/setup.py +++ b/setup.py @@ -9,7 +9,8 @@ import asyncio # Create the .crawl4ai folder in the user's home directory if it doesn't exist # If the folder already exists, remove the cache folder -crawl4ai_folder = Path(os.getenv("CRAWL4_AI_BASE_DIRECTORY")) or Path.home() +base_dir = os.getenv("CRAWL4_AI_BASE_DIRECTORY") +crawl4ai_folder = Path(base_dir) if base_dir else Path.home() crawl4ai_folder = crawl4ai_folder / ".crawl4ai" cache_folder = crawl4ai_folder / "cache" content_folders = [ From c8485776fe2e475bbba1f8ee513679999283441c Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 28 Nov 2024 20:04:16 +0800 Subject: [PATCH 097/115] docs: update README to reflect latest version v0.3.745 --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 26cc9fcc..e8e6cddf 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant community. 
It delivers blazing-fast, AI-ready web crawling tailored for LLMs, AI agents, and data pipelines. Open source, flexible, and built for real-time performance, Crawl4AI empowers developers with unmatched speed, precision, and deployment ease. -[✨ Check out what's new in the latest update!](#-recent-updates) +[✨ Check out latest update v0.3.745](#-recent-updates) ## 🧐 Why Crawl4AI? From c0e87abaee97e9e206eb787f8939fdf8790f4a2b Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 28 Nov 2024 21:43:08 +0800 Subject: [PATCH 098/115] fix: update package versions in requirements.txt for compatibility --- requirements.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/requirements.txt b/requirements.txt index ed259ac9..c0f6f183 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,16 +1,16 @@ aiosqlite~=0.20 html2text~=2024.2 lxml~=5.3 -litellm~=1.48 +litellm>=1.53.1 numpy>=1.26.0,<3 pillow~=10.4 -playwright>=1.47,<1.48 +playwright>=1.49.0 python-dotenv~=1.0 requests~=2.26 beautifulsoup4~=4.12 -tf-playwright-stealth~=1.0 +tf-playwright-stealth>=1.1.0 xxhash~=3.4 rank-bm25~=0.2 -aiofiles~=24.0 +aiofiles>=24.1.0 colorama~=0.4 snowballstemmer~=2.2 \ No newline at end of file From b0419edda6c0a25da82f65f557beee4e0a3daf02 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 29 Nov 2024 02:31:17 +0800 Subject: [PATCH 099/115] Update README.md (#300) --- README.md | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/README.md b/README.md index e8e6cddf..c9d92e17 100644 --- a/README.md +++ b/README.md @@ -125,34 +125,6 @@ if __name__ == "__main__": ✨ Visit our [Documentation Website](https://crawl4ai.com/mkdocs/) -## Features ✨ - -- 🆓 Completely free and open-source -- 🚀 Blazing fast performance, outperforming many paid services -- 🤖 LLM-friendly output formats (JSON, cleaned HTML, markdown) -- 🌐 Multi-browser support (Chromium, Firefox, WebKit) -- 🌍 Supports crawling multiple URLs simultaneously -- 🎨 Extracts and returns all 
media tags (Images, Audio, and Video) -- 🔗 Extracts all external and internal links -- 📚 Extracts metadata from the page -- 🔄 Custom hooks for authentication, headers, and page modifications -- 🕵️ User-agent customization -- 🖼️ Takes screenshots of pages with enhanced error handling -- 📜 Executes multiple custom JavaScripts before crawling -- 📊 Generates structured output without LLM using JsonCssExtractionStrategy -- 📚 Various chunking strategies: topic-based, regex, sentence, and more -- 🧠 Advanced extraction strategies: cosine clustering, LLM, and more -- 🎯 CSS selector support for precise data extraction -- 📝 Passes instructions/keywords to refine extraction -- 🔒 Proxy support with authentication for enhanced access -- 🔄 Session management for complex multi-page crawling -- 🌐 Asynchronous architecture for improved performance -- 🖼️ Improved image processing with lazy-loading detection -- 🕰️ Enhanced handling of delayed content loading -- 🔑 Custom headers support for LLM interactions -- 🖼️ iframe content extraction for comprehensive analysis -- ⏱️ Flexible timeout and delayed content retrieval options - ## Installation 🛠️ Crawl4AI offers flexible installation options to suit various use cases. You can install it as a Python package or use Docker. From 449dd7cc0b9d81e0f602b3868b478c8515a45bf1 Mon Sep 17 00:00:00 2001 From: unclecode Date: Fri, 29 Nov 2024 14:45:04 +0800 Subject: [PATCH 100/115] Migrating from the classic setup.py to a using PyProject approach. 
--- MANIFEST.in | 1 - build_hooks.py | 48 +++++++++++ docs/examples/quickstart_async.py | 128 +++++++++++++++++----------- plugin.py | 9 ++ post_install.py | 19 +++++ pyproject.toml | 75 ++++++++++++++++ requirements.txt | 16 ---- setup.cfg | 2 - setup.py | 136 ------------------------------ 9 files changed, 229 insertions(+), 205 deletions(-) delete mode 100644 MANIFEST.in create mode 100644 build_hooks.py create mode 100644 plugin.py create mode 100644 post_install.py create mode 100644 pyproject.toml delete mode 100644 requirements.txt delete mode 100644 setup.cfg delete mode 100644 setup.py diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 540b7204..00000000 --- a/MANIFEST.in +++ /dev/null @@ -1 +0,0 @@ -include requirements.txt \ No newline at end of file diff --git a/build_hooks.py b/build_hooks.py new file mode 100644 index 00000000..e59b5910 --- /dev/null +++ b/build_hooks.py @@ -0,0 +1,48 @@ +import os +import shutil +from pathlib import Path +import subprocess +import sys +from hatchling.builders.hooks.plugin.interface import BuildHookInterface +PLUGIN = "CustomBuildHook" + +class CustomBuildHook(BuildHookInterface): + def initialize(self, version, build_data): + # Create the .crawl4ai folder structure + base_dir = os.getenv("CRAWL4_AI_BASE_DIRECTORY") + crawl4ai_folder = Path(base_dir) if base_dir else Path.home() + crawl4ai_folder = crawl4ai_folder / ".crawl4ai" + cache_folder = crawl4ai_folder / "cache" + content_folders = [ + "html_content", + "cleaned_html", + "markdown_content", + "extracted_content", + "screenshots", + ] + + # Clean up old cache if exists + if cache_folder.exists(): + shutil.rmtree(cache_folder) + + # Create new folder structure + crawl4ai_folder.mkdir(exist_ok=True) + cache_folder.mkdir(exist_ok=True) + for folder in content_folders: + (crawl4ai_folder / folder).mkdir(exist_ok=True) + + # Install Playwright browsers + try: + subprocess.check_call([sys.executable, "-m", "playwright", "install"]) + except 
Exception as e: + print(f"Warning: Playwright installation failed: {e}") + print("Please run 'python -m playwright install' manually after installation") + + # Initialize database + try: + from crawl4ai.async_database import async_db_manager + import asyncio + asyncio.run(async_db_manager.initialize()) + except Exception as e: + print(f"Warning: Database initialization failed: {e}") + print("Database will be initialized on first use") \ No newline at end of file diff --git a/docs/examples/quickstart_async.py b/docs/examples/quickstart_async.py index 9f1eff53..01f7677c 100644 --- a/docs/examples/quickstart_async.py +++ b/docs/examples/quickstart_async.py @@ -32,7 +32,7 @@ print("Website: https://crawl4ai.com") async def simple_crawl(): print("\n--- Basic Usage ---") async with AsyncWebCrawler(verbose=True) as crawler: - result = await crawler.arun(url="https://www.nbcnews.com/business") + result = await crawler.arun(url="https://www.nbcnews.com/business", cache_mode= CacheMode.BYPASS) print(result.markdown[:500]) # Print first 500 characters async def simple_example_with_running_js_code(): @@ -76,16 +76,17 @@ async def use_proxy(): async with AsyncWebCrawler(verbose=True, proxy="http://your-proxy-url:port") as crawler: result = await crawler.arun( url="https://www.nbcnews.com/business", - bypass_cache=True + cache_mode= CacheMode.BYPASS ) - print(result.markdown[:500]) # Print first 500 characters + if result.success: + print(result.markdown[:500]) # Print first 500 characters async def capture_and_save_screenshot(url: str, output_path: str): async with AsyncWebCrawler(verbose=True) as crawler: result = await crawler.arun( url=url, screenshot=True, - bypass_cache=True + cache_mode= CacheMode.BYPASS ) if result.success and result.screenshot: @@ -141,41 +142,68 @@ async def extract_structured_data_using_llm(provider: str, api_token: str = None async def extract_structured_data_using_css_extractor(): print("\n--- Using JsonCssExtractionStrategy for Fast Structured 
Output ---") schema = { - "name": "Coinbase Crypto Prices", - "baseSelector": ".cds-tableRow-t45thuk", - "fields": [ - { - "name": "crypto", - "selector": "td:nth-child(1) h2", - "type": "text", - }, - { - "name": "symbol", - "selector": "td:nth-child(1) p", - "type": "text", - }, - { - "name": "price", - "selector": "td:nth-child(2)", - "type": "text", + "name": "KidoCode Courses", + "baseSelector": "section.charge-methodology .w-tab-content > div", + "fields": [ + { + "name": "section_title", + "selector": "h3.heading-50", + "type": "text", + }, + { + "name": "section_description", + "selector": ".charge-content", + "type": "text", + }, + { + "name": "course_name", + "selector": ".text-block-93", + "type": "text", + }, + { + "name": "course_description", + "selector": ".course-content-text", + "type": "text", + }, + { + "name": "course_icon", + "selector": ".image-92", + "type": "attribute", + "attribute": "src" + } + ] +} + + async with AsyncWebCrawler( + headless=True, + verbose=True + ) as crawler: + + # Create the JavaScript that handles clicking multiple times + js_click_tabs = """ + (async () => { + const tabs = document.querySelectorAll("section.charge-methodology .tabs-menu-3 > div"); + + for(let tab of tabs) { + // scroll to the tab + tab.scrollIntoView(); + tab.click(); + // Wait for content to load and animations to complete + await new Promise(r => setTimeout(r, 500)); } - ], - } + })(); + """ - extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True) - - async with AsyncWebCrawler(verbose=True) as crawler: result = await crawler.arun( - url="https://www.coinbase.com/explore", - extraction_strategy=extraction_strategy, - cache_mode=CacheMode.BYPASS, + url="https://www.kidocode.com/degrees/technology", + extraction_strategy=JsonCssExtractionStrategy(schema, verbose=True), + js_code=[js_click_tabs], + cache_mode=CacheMode.BYPASS ) - assert result.success, "Failed to crawl the page" - - news_teasers = json.loads(result.extracted_content) - 
print(f"Successfully extracted {len(news_teasers)} news teasers") - print(json.dumps(news_teasers[0], indent=2)) + companies = json.loads(result.extracted_content) + print(f"Successfully extracted {len(companies)} companies") + print(json.dumps(companies[0], indent=2)) # Advanced Session-Based Crawling with Dynamic Content 🔄 async def crawl_dynamic_content_pages_method_1(): @@ -363,21 +391,21 @@ async def crawl_custom_browser_type(): # Use Firefox start = time.time() async with AsyncWebCrawler(browser_type="firefox", verbose=True, headless = True) as crawler: - result = await crawler.arun(url="https://www.example.com", bypass_cache=True) + result = await crawler.arun(url="https://www.example.com", cache_mode= CacheMode.BYPASS) print(result.markdown[:500]) print("Time taken: ", time.time() - start) # Use WebKit start = time.time() async with AsyncWebCrawler(browser_type="webkit", verbose=True, headless = True) as crawler: - result = await crawler.arun(url="https://www.example.com", bypass_cache=True) + result = await crawler.arun(url="https://www.example.com", cache_mode= CacheMode.BYPASS) print(result.markdown[:500]) print("Time taken: ", time.time() - start) # Use Chromium (default) start = time.time() async with AsyncWebCrawler(verbose=True, headless = True) as crawler: - result = await crawler.arun(url="https://www.example.com", bypass_cache=True) + result = await crawler.arun(url="https://www.example.com", cache_mode= CacheMode.BYPASS) print(result.markdown[:500]) print("Time taken: ", time.time() - start) @@ -534,29 +562,29 @@ async def fit_markdown_remove_overlay(): async def main(): - await simple_crawl() - await simple_example_with_running_js_code() - await simple_example_with_css_selector() - await use_proxy() - await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg")) - await extract_structured_data_using_css_extractor() + # await simple_crawl() + # await simple_example_with_running_js_code() + 
# await simple_example_with_css_selector() + # await use_proxy() + # await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg")) + # await extract_structured_data_using_css_extractor() # LLM extraction examples # await extract_structured_data_using_llm() # await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY")) # await extract_structured_data_using_llm("ollama/llama3.2") - await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY")) + # await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY")) # You always can pass custom headers to the extraction strategy - custom_headers = { - "Authorization": "Bearer your-custom-token", - "X-Custom-Header": "Some-Value" - } - await extract_structured_data_using_llm(extra_headers=custom_headers) + # custom_headers = { + # "Authorization": "Bearer your-custom-token", + # "X-Custom-Header": "Some-Value" + # } + # await extract_structured_data_using_llm(extra_headers=custom_headers) # await crawl_dynamic_content_pages_method_1() # await crawl_dynamic_content_pages_method_2() - await crawl_dynamic_content_pages_method_3() + # await crawl_dynamic_content_pages_method_3() await crawl_custom_browser_type() diff --git a/plugin.py b/plugin.py new file mode 100644 index 00000000..1e1b11bf --- /dev/null +++ b/plugin.py @@ -0,0 +1,9 @@ +from colorama import Fore, Style +import subprocess +import sys + +def post_install(): + print(f"\n{Fore.YELLOW}{'='*40}") + print(f"{Fore.RED}IMPORTANT: Run this command now:") + print(f"{Fore.GREEN}python -m playwright install") + print(f"{Fore.YELLOW}{'='*40}{Style.RESET_ALL}\n") \ No newline at end of file diff --git a/post_install.py b/post_install.py new file mode 100644 index 00000000..e536e547 --- /dev/null +++ b/post_install.py @@ -0,0 +1,19 @@ +from colorama import Fore, Style +import subprocess +import sys +import 
distutils.log as log +from pathlib import Path + +def main(): + try: + subprocess.check_call([sys.executable, "-m", "playwright", "install"], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL) + except: + print(f"\n{Fore.YELLOW}{'='*40}") + print(f"{Fore.RED}IMPORTANT: Run this command now:") + print(f"{Fore.GREEN}python -m playwright install") + print(f"{Fore.YELLOW}{'='*40}{Style.RESET_ALL}\n") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..cfef8101 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,75 @@ +[build-system] +requires = ["hatchling", "hatch-fancy-pypi-readme>=22.5.0"] +build-backend = "hatchling.build" + +[project] +name = "Crawl4AI" +dynamic = ["version"] +description = "🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper" +readme = "README.md" +license = "Apache-2.0" +requires-python = ">=3.7" +authors = [ + { name = "Unclecode", email = "unclecode@kidocode.com" }, +] +classifiers = [ + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", +] +dependencies = [ + "aiosqlite~=0.20", + "html2text~=2024.2", + "lxml~=5.3", + "litellm>=1.53.1", + "numpy>=1.26.0,<3", + "pillow~=10.4", + "playwright>=1.49.0", + "python-dotenv~=1.0", + "requests~=2.26", + "beautifulsoup4~=4.12", + "tf-playwright-stealth>=1.1.0", + "xxhash~=3.4", + "rank-bm25~=0.2", + "aiofiles>=24.1.0", + "colorama~=0.4", + "snowballstemmer~=2.2", +] + +[project.optional-dependencies] +torch = ["torch", "nltk", "scikit-learn"] +transformer = ["transformers", "tokenizers"] +cosine = ["torch", "transformers", "nltk"] +sync = ["selenium"] +all = [ + "torch", + "nltk", + "scikit-learn", + 
"transformers", + "tokenizers", + "selenium", +] + +[project.urls] +Homepage = "https://github.com/unclecode/crawl4ai" +Documentation = "https://crawl4ai.com/mkdocs/" + +[project.scripts] +crawl4ai-download-models = "crawl4ai.model_loader:main" +crawl4ai-migrate = "crawl4ai.migrations:main" +crawl4ai-post-install = "crawl4ai.post_install:main" + +[tool.hatch.version] +path = "crawl4ai/__version__.py" + +[tool.hatch.build.hooks.custom] +dependencies = ["hatch-fancy-pypi-readme>=22.5.0"] +path = "build_hooks.py" + +[project.entry-points.hatch] +crawl4ai = "crawl4ai.plugin:post_install" diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index c0f6f183..00000000 --- a/requirements.txt +++ /dev/null @@ -1,16 +0,0 @@ -aiosqlite~=0.20 -html2text~=2024.2 -lxml~=5.3 -litellm>=1.53.1 -numpy>=1.26.0,<3 -pillow~=10.4 -playwright>=1.49.0 -python-dotenv~=1.0 -requests~=2.26 -beautifulsoup4~=4.12 -tf-playwright-stealth>=1.1.0 -xxhash~=3.4 -rank-bm25~=0.2 -aiofiles>=24.1.0 -colorama~=0.4 -snowballstemmer~=2.2 \ No newline at end of file diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 56490d6a..00000000 --- a/setup.cfg +++ /dev/null @@ -1,2 +0,0 @@ -[options] -include_package_data = True \ No newline at end of file diff --git a/setup.py b/setup.py deleted file mode 100644 index d44169bf..00000000 --- a/setup.py +++ /dev/null @@ -1,136 +0,0 @@ -from setuptools import setup, find_packages -from setuptools.command.install import install -import os -from pathlib import Path -import shutil -import subprocess -import sys -import asyncio - -# Create the .crawl4ai folder in the user's home directory if it doesn't exist -# If the folder already exists, remove the cache folder -base_dir = os.getenv("CRAWL4_AI_BASE_DIRECTORY") -crawl4ai_folder = Path(base_dir) if base_dir else Path.home() -crawl4ai_folder = crawl4ai_folder / ".crawl4ai" -cache_folder = crawl4ai_folder / "cache" -content_folders = [ - "html_content", - "cleaned_html", - 
"markdown_content", - "extracted_content", - "screenshots", -] - -# Clean up old cache if exists -if cache_folder.exists(): - shutil.rmtree(cache_folder) - -# Create new folder structure -crawl4ai_folder.mkdir(exist_ok=True) -cache_folder.mkdir(exist_ok=True) -for folder in content_folders: - (crawl4ai_folder / folder).mkdir(exist_ok=True) - -# Read requirements and version -__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) -with open(os.path.join(__location__, "requirements.txt")) as f: - requirements = f.read().splitlines() - -with open("crawl4ai/__version__.py") as f: - for line in f: - if line.startswith("__version__"): - version = line.split("=")[1].strip().strip('"') - break - -# Define requirements -default_requirements = requirements -torch_requirements = ["torch", "nltk", "scikit-learn"] -transformer_requirements = ["transformers", "tokenizers"] -cosine_similarity_requirements = ["torch", "transformers", "nltk"] -sync_requirements = ["selenium"] - - -def install_playwright(): - print("Installing Playwright browsers...") - try: - subprocess.check_call([sys.executable, "-m", "playwright", "install"]) - print("Playwright installation completed successfully.") - except subprocess.CalledProcessError as e: - print(f"Error during Playwright installation: {e}") - print( - "Please run 'python -m playwright install' manually after the installation." - ) - except Exception as e: - print(f"Unexpected error during Playwright installation: {e}") - print( - "Please run 'python -m playwright install' manually after the installation." - ) - - -def run_migration(): - """Initialize database during installation""" - try: - print("Starting database initialization...") - from crawl4ai.async_database import async_db_manager - - asyncio.run(async_db_manager.initialize()) - print("Database initialization completed successfully.") - except ImportError: - print("Warning: Database module not found. 
Will initialize on first use.") - except Exception as e: - print(f"Warning: Database initialization failed: {e}") - print("Database will be initialized on first use") - - -class PostInstallCommand(install): - def run(self): - install.run(self) - install_playwright() - # run_migration() - - -setup( - name="Crawl4AI", - version=version, - description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper", - long_description=open("README.md", encoding="utf-8").read(), - long_description_content_type="text/markdown", - url="https://github.com/unclecode/crawl4ai", - author="Unclecode", - author_email="unclecode@kidocode.com", - license="MIT", - packages=find_packages(), - install_requires=default_requirements - + ["playwright", "aiofiles"], # Added aiofiles - extras_require={ - "torch": torch_requirements, - "transformer": transformer_requirements, - "cosine": cosine_similarity_requirements, - "sync": sync_requirements, - "all": default_requirements - + torch_requirements - + transformer_requirements - + cosine_similarity_requirements - + sync_requirements, - }, - entry_points={ - "console_scripts": [ - "crawl4ai-download-models=crawl4ai.model_loader:main", - "crawl4ai-migrate=crawl4ai.migrations:main", # Added migration command - ], - }, - classifiers=[ - "Development Status :: 3 - Alpha", - "Intended Audience :: Developers", - "License :: OSI Approved :: Apache Software License", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.7", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - ], - python_requires=">=3.7", - cmdclass={ - "install": PostInstallCommand, - }, -) From 12e73d489846dc83c29347bf84646ad8daef6cfc Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 29 Nov 2024 16:01:19 +0800 Subject: [PATCH 101/115] refactor: remove legacy build hooks and setup files, migrate to setup.cfg and pyproject.toml --- MANIFEST.in | 1 + build_hooks.py | 48 
----------------- plugin.py | 9 ---- post_install.py | 19 ------- pyproject.toml | 75 -------------------------- requirements.txt | 16 ++++++ setup.cfg | 2 + setup.py | 136 +++++++++++++++++++++++++++++++++++++++++++++++ 8 files changed, 155 insertions(+), 151 deletions(-) create mode 100644 MANIFEST.in delete mode 100644 build_hooks.py delete mode 100644 plugin.py delete mode 100644 post_install.py delete mode 100644 pyproject.toml create mode 100644 requirements.txt create mode 100644 setup.cfg create mode 100644 setup.py diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 00000000..540b7204 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1 @@ +include requirements.txt \ No newline at end of file diff --git a/build_hooks.py b/build_hooks.py deleted file mode 100644 index e59b5910..00000000 --- a/build_hooks.py +++ /dev/null @@ -1,48 +0,0 @@ -import os -import shutil -from pathlib import Path -import subprocess -import sys -from hatchling.builders.hooks.plugin.interface import BuildHookInterface -PLUGIN = "CustomBuildHook" - -class CustomBuildHook(BuildHookInterface): - def initialize(self, version, build_data): - # Create the .crawl4ai folder structure - base_dir = os.getenv("CRAWL4_AI_BASE_DIRECTORY") - crawl4ai_folder = Path(base_dir) if base_dir else Path.home() - crawl4ai_folder = crawl4ai_folder / ".crawl4ai" - cache_folder = crawl4ai_folder / "cache" - content_folders = [ - "html_content", - "cleaned_html", - "markdown_content", - "extracted_content", - "screenshots", - ] - - # Clean up old cache if exists - if cache_folder.exists(): - shutil.rmtree(cache_folder) - - # Create new folder structure - crawl4ai_folder.mkdir(exist_ok=True) - cache_folder.mkdir(exist_ok=True) - for folder in content_folders: - (crawl4ai_folder / folder).mkdir(exist_ok=True) - - # Install Playwright browsers - try: - subprocess.check_call([sys.executable, "-m", "playwright", "install"]) - except Exception as e: - print(f"Warning: Playwright installation failed: {e}") - 
print("Please run 'python -m playwright install' manually after installation") - - # Initialize database - try: - from crawl4ai.async_database import async_db_manager - import asyncio - asyncio.run(async_db_manager.initialize()) - except Exception as e: - print(f"Warning: Database initialization failed: {e}") - print("Database will be initialized on first use") \ No newline at end of file diff --git a/plugin.py b/plugin.py deleted file mode 100644 index 1e1b11bf..00000000 --- a/plugin.py +++ /dev/null @@ -1,9 +0,0 @@ -from colorama import Fore, Style -import subprocess -import sys - -def post_install(): - print(f"\n{Fore.YELLOW}{'='*40}") - print(f"{Fore.RED}IMPORTANT: Run this command now:") - print(f"{Fore.GREEN}python -m playwright install") - print(f"{Fore.YELLOW}{'='*40}{Style.RESET_ALL}\n") \ No newline at end of file diff --git a/post_install.py b/post_install.py deleted file mode 100644 index e536e547..00000000 --- a/post_install.py +++ /dev/null @@ -1,19 +0,0 @@ -from colorama import Fore, Style -import subprocess -import sys -import distutils.log as log -from pathlib import Path - -def main(): - try: - subprocess.check_call([sys.executable, "-m", "playwright", "install"], - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL) - except: - print(f"\n{Fore.YELLOW}{'='*40}") - print(f"{Fore.RED}IMPORTANT: Run this command now:") - print(f"{Fore.GREEN}python -m playwright install") - print(f"{Fore.YELLOW}{'='*40}{Style.RESET_ALL}\n") - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml deleted file mode 100644 index cfef8101..00000000 --- a/pyproject.toml +++ /dev/null @@ -1,75 +0,0 @@ -[build-system] -requires = ["hatchling", "hatch-fancy-pypi-readme>=22.5.0"] -build-backend = "hatchling.build" - -[project] -name = "Crawl4AI" -dynamic = ["version"] -description = "🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper" -readme = "README.md" -license = "Apache-2.0" -requires-python = ">=3.7" 
-authors = [ - { name = "Unclecode", email = "unclecode@kidocode.com" }, -] -classifiers = [ - "Development Status :: 3 - Alpha", - "Intended Audience :: Developers", - "License :: OSI Approved :: Apache Software License", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.7", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", -] -dependencies = [ - "aiosqlite~=0.20", - "html2text~=2024.2", - "lxml~=5.3", - "litellm>=1.53.1", - "numpy>=1.26.0,<3", - "pillow~=10.4", - "playwright>=1.49.0", - "python-dotenv~=1.0", - "requests~=2.26", - "beautifulsoup4~=4.12", - "tf-playwright-stealth>=1.1.0", - "xxhash~=3.4", - "rank-bm25~=0.2", - "aiofiles>=24.1.0", - "colorama~=0.4", - "snowballstemmer~=2.2", -] - -[project.optional-dependencies] -torch = ["torch", "nltk", "scikit-learn"] -transformer = ["transformers", "tokenizers"] -cosine = ["torch", "transformers", "nltk"] -sync = ["selenium"] -all = [ - "torch", - "nltk", - "scikit-learn", - "transformers", - "tokenizers", - "selenium", -] - -[project.urls] -Homepage = "https://github.com/unclecode/crawl4ai" -Documentation = "https://crawl4ai.com/mkdocs/" - -[project.scripts] -crawl4ai-download-models = "crawl4ai.model_loader:main" -crawl4ai-migrate = "crawl4ai.migrations:main" -crawl4ai-post-install = "crawl4ai.post_install:main" - -[tool.hatch.version] -path = "crawl4ai/__version__.py" - -[tool.hatch.build.hooks.custom] -dependencies = ["hatch-fancy-pypi-readme>=22.5.0"] -path = "build_hooks.py" - -[project.entry-points.hatch] -crawl4ai = "crawl4ai.plugin:post_install" diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..c0f6f183 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,16 @@ +aiosqlite~=0.20 +html2text~=2024.2 +lxml~=5.3 +litellm>=1.53.1 +numpy>=1.26.0,<3 +pillow~=10.4 +playwright>=1.49.0 +python-dotenv~=1.0 +requests~=2.26 +beautifulsoup4~=4.12 +tf-playwright-stealth>=1.1.0 
+xxhash~=3.4 +rank-bm25~=0.2 +aiofiles>=24.1.0 +colorama~=0.4 +snowballstemmer~=2.2 \ No newline at end of file diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 00000000..56490d6a --- /dev/null +++ b/setup.cfg @@ -0,0 +1,2 @@ +[options] +include_package_data = True \ No newline at end of file diff --git a/setup.py b/setup.py new file mode 100644 index 00000000..d44169bf --- /dev/null +++ b/setup.py @@ -0,0 +1,136 @@ +from setuptools import setup, find_packages +from setuptools.command.install import install +import os +from pathlib import Path +import shutil +import subprocess +import sys +import asyncio + +# Create the .crawl4ai folder in the user's home directory if it doesn't exist +# If the folder already exists, remove the cache folder +base_dir = os.getenv("CRAWL4_AI_BASE_DIRECTORY") +crawl4ai_folder = Path(base_dir) if base_dir else Path.home() +crawl4ai_folder = crawl4ai_folder / ".crawl4ai" +cache_folder = crawl4ai_folder / "cache" +content_folders = [ + "html_content", + "cleaned_html", + "markdown_content", + "extracted_content", + "screenshots", +] + +# Clean up old cache if exists +if cache_folder.exists(): + shutil.rmtree(cache_folder) + +# Create new folder structure +crawl4ai_folder.mkdir(exist_ok=True) +cache_folder.mkdir(exist_ok=True) +for folder in content_folders: + (crawl4ai_folder / folder).mkdir(exist_ok=True) + +# Read requirements and version +__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) +with open(os.path.join(__location__, "requirements.txt")) as f: + requirements = f.read().splitlines() + +with open("crawl4ai/__version__.py") as f: + for line in f: + if line.startswith("__version__"): + version = line.split("=")[1].strip().strip('"') + break + +# Define requirements +default_requirements = requirements +torch_requirements = ["torch", "nltk", "scikit-learn"] +transformer_requirements = ["transformers", "tokenizers"] +cosine_similarity_requirements = ["torch", "transformers", "nltk"] 
+sync_requirements = ["selenium"] + + +def install_playwright(): + print("Installing Playwright browsers...") + try: + subprocess.check_call([sys.executable, "-m", "playwright", "install"]) + print("Playwright installation completed successfully.") + except subprocess.CalledProcessError as e: + print(f"Error during Playwright installation: {e}") + print( + "Please run 'python -m playwright install' manually after the installation." + ) + except Exception as e: + print(f"Unexpected error during Playwright installation: {e}") + print( + "Please run 'python -m playwright install' manually after the installation." + ) + + +def run_migration(): + """Initialize database during installation""" + try: + print("Starting database initialization...") + from crawl4ai.async_database import async_db_manager + + asyncio.run(async_db_manager.initialize()) + print("Database initialization completed successfully.") + except ImportError: + print("Warning: Database module not found. Will initialize on first use.") + except Exception as e: + print(f"Warning: Database initialization failed: {e}") + print("Database will be initialized on first use") + + +class PostInstallCommand(install): + def run(self): + install.run(self) + install_playwright() + # run_migration() + + +setup( + name="Crawl4AI", + version=version, + description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper", + long_description=open("README.md", encoding="utf-8").read(), + long_description_content_type="text/markdown", + url="https://github.com/unclecode/crawl4ai", + author="Unclecode", + author_email="unclecode@kidocode.com", + license="MIT", + packages=find_packages(), + install_requires=default_requirements + + ["playwright", "aiofiles"], # Added aiofiles + extras_require={ + "torch": torch_requirements, + "transformer": transformer_requirements, + "cosine": cosine_similarity_requirements, + "sync": sync_requirements, + "all": default_requirements + + torch_requirements + + transformer_requirements + 
+ cosine_similarity_requirements + + sync_requirements, + }, + entry_points={ + "console_scripts": [ + "crawl4ai-download-models=crawl4ai.model_loader:main", + "crawl4ai-migrate=crawl4ai.migrations:main", # Added migration command + ], + }, + classifiers=[ + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + ], + python_requires=">=3.7", + cmdclass={ + "install": PostInstallCommand, + }, +) From d202f3539bf7447f7594f7f1897c3062c337ae52 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 29 Nov 2024 18:48:44 +0800 Subject: [PATCH 102/115] Enhance installation and migration processes - Added a post-installation setup script for initialization. - Updated README with installation notes for Playwright setup. - Enhanced migration logging for better error visibility. - Added 'pydantic' to requirements. - Bumped version to 0.3.746. --- README.md | 32 ++----------------- crawl4ai/__init__.py | 1 - crawl4ai/__version__.py | 2 +- crawl4ai/install.py | 44 ++++++++++++++++++++++++++ crawl4ai/migrations.py | 40 ++++++++++++++++-------- docs/examples/quickstart_async.py | 18 +++++------ requirements.txt | 4 +-- setup.py | 51 ++----------------------------- 8 files changed, 90 insertions(+), 102 deletions(-) create mode 100644 crawl4ai/install.py diff --git a/README.md b/README.md index e8e6cddf..bbfa5858 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,7 @@ Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant 1. Install Crawl4AI: ```bash pip install crawl4ai +crawl4ai-setup # Setup the browser ``` 2. 
Run a simple web crawl: @@ -125,34 +126,6 @@ if __name__ == "__main__": ✨ Visit our [Documentation Website](https://crawl4ai.com/mkdocs/) -## Features ✨ - -- 🆓 Completely free and open-source -- 🚀 Blazing fast performance, outperforming many paid services -- 🤖 LLM-friendly output formats (JSON, cleaned HTML, markdown) -- 🌐 Multi-browser support (Chromium, Firefox, WebKit) -- 🌍 Supports crawling multiple URLs simultaneously -- 🎨 Extracts and returns all media tags (Images, Audio, and Video) -- 🔗 Extracts all external and internal links -- 📚 Extracts metadata from the page -- 🔄 Custom hooks for authentication, headers, and page modifications -- 🕵️ User-agent customization -- 🖼️ Takes screenshots of pages with enhanced error handling -- 📜 Executes multiple custom JavaScripts before crawling -- 📊 Generates structured output without LLM using JsonCssExtractionStrategy -- 📚 Various chunking strategies: topic-based, regex, sentence, and more -- 🧠 Advanced extraction strategies: cosine clustering, LLM, and more -- 🎯 CSS selector support for precise data extraction -- 📝 Passes instructions/keywords to refine extraction -- 🔒 Proxy support with authentication for enhanced access -- 🔄 Session management for complex multi-page crawling -- 🌐 Asynchronous architecture for improved performance -- 🖼️ Improved image processing with lazy-loading detection -- 🕰️ Enhanced handling of delayed content loading -- 🔑 Custom headers support for LLM interactions -- 🖼️ iframe content extraction for comprehensive analysis -- ⏱️ Flexible timeout and delayed content retrieval options - ## Installation 🛠️ Crawl4AI offers flexible installation options to suit various use cases. You can install it as a Python package or use Docker. @@ -168,11 +141,12 @@ For basic web crawling and scraping tasks: ```bash pip install crawl4ai +crawl4ai-setup # Setup the browser ``` By default, this will install the asynchronous version of Crawl4AI, using Playwright for web crawling. 
-👉 **Note**: When you install Crawl4AI, the setup script should automatically install and set up Playwright. However, if you encounter any Playwright-related errors, you can manually install it using one of these methods: +👉 **Note**: When you install Crawl4AI, the `crawl4ai-setup` should automatically install and set up Playwright. However, if you encounter any Playwright-related errors, you can manually install it using one of these methods: 1. Through the command line: diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py index 0ccf13d8..cee7c25b 100644 --- a/crawl4ai/__init__.py +++ b/crawl4ai/__init__.py @@ -4,7 +4,6 @@ from .async_webcrawler import AsyncWebCrawler, CacheMode from .models import CrawlResult from .__version__ import __version__ -# __version__ = "0.3.73" __all__ = [ "AsyncWebCrawler", diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index 8b69d491..4a938b75 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,2 +1,2 @@ # crawl4ai/_version.py -__version__ = "0.3.745" +__version__ = "0.3.746" diff --git a/crawl4ai/install.py b/crawl4ai/install.py new file mode 100644 index 00000000..71fe30ea --- /dev/null +++ b/crawl4ai/install.py @@ -0,0 +1,44 @@ +import subprocess +import sys +import asyncio +from .async_logger import AsyncLogger, LogLevel + +# Initialize logger +logger = AsyncLogger(log_level=LogLevel.DEBUG, verbose=True) + +def post_install(): + """Run all post-installation tasks""" + logger.info("Running post-installation setup...", tag="INIT") + install_playwright() + run_migration() + logger.success("Post-installation setup completed!", tag="COMPLETE") + +def install_playwright(): + logger.info("Installing Playwright browsers...", tag="INIT") + try: + subprocess.check_call([sys.executable, "-m", "playwright", "install"]) + logger.success("Playwright installation completed successfully.", tag="COMPLETE") + except subprocess.CalledProcessError as e: + logger.error(f"Error during Playwright installation: 
{e}", tag="ERROR") + logger.warning( + "Please run 'python -m playwright install' manually after the installation." + ) + except Exception as e: + logger.error(f"Unexpected error during Playwright installation: {e}", tag="ERROR") + logger.warning( + "Please run 'python -m playwright install' manually after the installation." + ) + +def run_migration(): + """Initialize database during installation""" + try: + logger.info("Starting database initialization...", tag="INIT") + from crawl4ai.async_database import async_db_manager + + asyncio.run(async_db_manager.initialize()) + logger.success("Database initialization completed successfully.", tag="COMPLETE") + except ImportError: + logger.warning("Database module not found. Will initialize on first use.") + except Exception as e: + logger.warning(f"Database initialization failed: {e}") + logger.warning("Database will be initialized on first use") \ No newline at end of file diff --git a/crawl4ai/migrations.py b/crawl4ai/migrations.py index 77616086..3386b0fb 100644 --- a/crawl4ai/migrations.py +++ b/crawl4ai/migrations.py @@ -9,9 +9,13 @@ import aiofiles import shutil import time from datetime import datetime +from .async_logger import AsyncLogger, LogLevel -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) +# Initialize logger +logger = AsyncLogger(log_level=LogLevel.DEBUG, verbose=True) + +# logging.basicConfig(level=logging.INFO) +# logger = logging.getLogger(__name__) class DatabaseMigration: def __init__(self, db_path: str): @@ -55,7 +59,8 @@ class DatabaseMigration: async def migrate_database(self): """Migrate existing database to file-based storage""" - logger.info("Starting database migration...") + # logger.info("Starting database migration...") + logger.info("Starting database migration...", tag="INIT") try: async with aiosqlite.connect(self.db_path) as db: @@ -91,19 +96,25 @@ class DatabaseMigration: migrated_count += 1 if migrated_count % 100 == 0: - logger.info(f"Migrated 
{migrated_count} records...") + logger.info(f"Migrated {migrated_count} records...", tag="INIT") + await db.commit() - logger.info(f"Migration completed. {migrated_count} records processed.") + logger.success(f"Migration completed. {migrated_count} records processed.", tag="COMPLETE") except Exception as e: - logger.error(f"Migration failed: {e}") - raise + # logger.error(f"Migration failed: {e}") + logger.error( + message="Migration failed: {error}", + tag="ERROR", + params={"error": str(e)} + ) + raise e async def backup_database(db_path: str) -> str: """Create backup of existing database""" if not os.path.exists(db_path): - logger.info("No existing database found. Skipping backup.") + logger.info("No existing database found. Skipping backup.", tag="INIT") return None # Create backup with timestamp @@ -116,11 +127,16 @@ async def backup_database(db_path: str) -> str: # Create backup shutil.copy2(db_path, backup_path) - logger.info(f"Database backup created at: {backup_path}") + logger.info(f"Database backup created at: {backup_path}", tag="COMPLETE") return backup_path except Exception as e: - logger.error(f"Backup failed: {e}") - raise + # logger.error(f"Backup failed: {e}") + logger.error( + message="Migration failed: {error}", + tag="ERROR", + params={"error": str(e)} + ) + raise e async def run_migration(db_path: Optional[str] = None): """Run database migration""" @@ -128,7 +144,7 @@ async def run_migration(db_path: Optional[str] = None): db_path = os.path.join(Path.home(), ".crawl4ai", "crawl4ai.db") if not os.path.exists(db_path): - logger.info("No existing database found. Skipping migration.") + logger.info("No existing database found. 
Skipping migration.", tag="INIT") return # Create backup first diff --git a/docs/examples/quickstart_async.py b/docs/examples/quickstart_async.py index 01f7677c..679a9bc2 100644 --- a/docs/examples/quickstart_async.py +++ b/docs/examples/quickstart_async.py @@ -562,18 +562,18 @@ async def fit_markdown_remove_overlay(): async def main(): - # await simple_crawl() - # await simple_example_with_running_js_code() - # await simple_example_with_css_selector() + await simple_crawl() + await simple_example_with_running_js_code() + await simple_example_with_css_selector() # await use_proxy() - # await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg")) - # await extract_structured_data_using_css_extractor() + await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg")) + await extract_structured_data_using_css_extractor() # LLM extraction examples # await extract_structured_data_using_llm() # await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY")) # await extract_structured_data_using_llm("ollama/llama3.2") - # await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY")) + await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY")) # You always can pass custom headers to the extraction strategy # custom_headers = { @@ -582,9 +582,9 @@ async def main(): # } # await extract_structured_data_using_llm(extra_headers=custom_headers) - # await crawl_dynamic_content_pages_method_1() - # await crawl_dynamic_content_pages_method_2() - # await crawl_dynamic_content_pages_method_3() + await crawl_dynamic_content_pages_method_1() + await crawl_dynamic_content_pages_method_2() + await crawl_dynamic_content_pages_method_3() await crawl_custom_browser_type() diff --git a/requirements.txt b/requirements.txt index c0f6f183..741e12ef 100644 --- 
a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,4 @@ aiosqlite~=0.20 -html2text~=2024.2 lxml~=5.3 litellm>=1.53.1 numpy>=1.26.0,<3 @@ -13,4 +12,5 @@ xxhash~=3.4 rank-bm25~=0.2 aiofiles>=24.1.0 colorama~=0.4 -snowballstemmer~=2.2 \ No newline at end of file +snowballstemmer~=2.2 +pydantic>=2.10 \ No newline at end of file diff --git a/setup.py b/setup.py index d44169bf..e6840cd0 100644 --- a/setup.py +++ b/setup.py @@ -1,11 +1,8 @@ from setuptools import setup, find_packages -from setuptools.command.install import install import os from pathlib import Path import shutil -import subprocess -import sys -import asyncio + # Create the .crawl4ai folder in the user's home directory if it doesn't exist # If the folder already exists, remove the cache folder @@ -49,46 +46,6 @@ transformer_requirements = ["transformers", "tokenizers"] cosine_similarity_requirements = ["torch", "transformers", "nltk"] sync_requirements = ["selenium"] - -def install_playwright(): - print("Installing Playwright browsers...") - try: - subprocess.check_call([sys.executable, "-m", "playwright", "install"]) - print("Playwright installation completed successfully.") - except subprocess.CalledProcessError as e: - print(f"Error during Playwright installation: {e}") - print( - "Please run 'python -m playwright install' manually after the installation." - ) - except Exception as e: - print(f"Unexpected error during Playwright installation: {e}") - print( - "Please run 'python -m playwright install' manually after the installation." - ) - - -def run_migration(): - """Initialize database during installation""" - try: - print("Starting database initialization...") - from crawl4ai.async_database import async_db_manager - - asyncio.run(async_db_manager.initialize()) - print("Database initialization completed successfully.") - except ImportError: - print("Warning: Database module not found. 
Will initialize on first use.") - except Exception as e: - print(f"Warning: Database initialization failed: {e}") - print("Database will be initialized on first use") - - -class PostInstallCommand(install): - def run(self): - install.run(self) - install_playwright() - # run_migration() - - setup( name="Crawl4AI", version=version, @@ -116,7 +73,8 @@ setup( entry_points={ "console_scripts": [ "crawl4ai-download-models=crawl4ai.model_loader:main", - "crawl4ai-migrate=crawl4ai.migrations:main", # Added migration command + "crawl4ai-migrate=crawl4ai.migrations:main", + 'crawl4ai-setup=crawl4ai.install:post_install', ], }, classifiers=[ @@ -130,7 +88,4 @@ setup( "Programming Language :: Python :: 3.10", ], python_requires=">=3.7", - cmdclass={ - "install": PostInstallCommand, - }, ) From 93bf3e8a1f87760e04d6a18b2e27bae0f5d5da0e Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 29 Nov 2024 20:08:09 +0800 Subject: [PATCH 103/115] Refactor Dockerfile and clean up main.py - Enhanced Dockerfile for platform-specific installations - Added ARG for TARGETPLATFORM and BUILDPLATFORM - Improved GPU support conditional on TARGETPLATFORM - Removed static pages mounting in main.py - Streamlined code structure to improve maintainability --- Dockerfile | 25 ++++++++++++++++--------- main.py | 4 ---- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/Dockerfile b/Dockerfile index bd71deae..2997590a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,9 @@ # syntax=docker/dockerfile:1.4 -# Build arguments +ARG TARGETPLATFORM +ARG BUILDPLATFORM + +# Other build arguments ARG PYTHON_VERSION=3.10 # Base stage with system dependencies @@ -63,13 +66,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ && rm -rf /var/lib/apt/lists/* # GPU support if enabled and architecture is supported -RUN if [ "$ENABLE_GPU" = "true" ] && [ "$(dpkg --print-architecture)" != "arm64" ] ; then \ - apt-get update && apt-get install -y --no-install-recommends \ - 
nvidia-cuda-toolkit \ - && rm -rf /var/lib/apt/lists/* ; \ - else \ - echo "Skipping NVIDIA CUDA Toolkit installation (unsupported architecture or GPU disabled)"; \ - fi +RUN if [ "$ENABLE_GPU" = "true" ] && [ "$TARGETPLATFORM" = "linux/amd64" ] ; then \ + apt-get update && apt-get install -y --no-install-recommends \ + nvidia-cuda-toolkit \ + && rm -rf /var/lib/apt/lists/* ; \ +else \ + echo "Skipping NVIDIA CUDA Toolkit installation (unsupported platform or GPU disabled)"; \ +fi # Create and set working directory WORKDIR /app @@ -120,7 +123,11 @@ RUN pip install --no-cache-dir \ RUN mkdocs build # Install Playwright and browsers -RUN playwright install +RUN if [ "$TARGETPLATFORM" = "linux/amd64" ]; then \ + playwright install chromium; \ + elif [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ + playwright install chromium; \ + fi # Expose port EXPOSE 8000 11235 9222 8080 diff --git a/main.py b/main.py index 6d217410..d6c792e8 100644 --- a/main.py +++ b/main.py @@ -340,9 +340,6 @@ app.add_middleware( allow_headers=["*"], # Allows all headers ) -# Mount the pages directory as a static directory -app.mount("/pages", StaticFiles(directory=__location__ + "/pages"), name="pages") - # API token security security = HTTPBearer() CRAWL4AI_API_TOKEN = os.getenv("CRAWL4AI_API_TOKEN") or "test_api_code" @@ -364,7 +361,6 @@ if os.path.exists(__location__ + "/site"): app.mount("/mkdocs", StaticFiles(directory="site", html=True), name="mkdocs") site_templates = Jinja2Templates(directory=__location__ + "/site") -templates = Jinja2Templates(directory=__location__ + "/pages") crawler_service = CrawlerService() From f9c98a377dd1dda28f88cd5ab4e801535a88abcc Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 29 Nov 2024 20:52:51 +0800 Subject: [PATCH 104/115] Enhance Docker support and improve installation process - Added new Docker commands for platform-specific builds. - Updated README with comprehensive installation and setup instructions. 
- Introduced `post_install` method in setup script for automation. - Refined migration processes with enhanced error logging. - Bump version to 0.3.746 and updated dependencies. --- CHANGELOG.md | 59 +++++++++++ README.md | 177 +++++++++++++++++++++++++++----- docker-compose.yml | 65 ++++++------ docs/examples/docker_example.py | 22 ++-- 4 files changed, 256 insertions(+), 67 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5ec79639..309218dc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,64 @@ # Changelog +## [0.3.746] November 29, 2024 + +### Major Features +1. Enhanced Docker Support (Nov 29, 2024) + - Improved GPU support in Docker images. + - Dockerfile refactored for better platform-specific installations. + - Introduced new Docker commands for different platforms: + - `basic-amd64`, `all-amd64`, `gpu-amd64` for AMD64. + - `basic-arm64`, `all-arm64`, `gpu-arm64` for ARM64. + +### Infrastructure & Documentation +- Enhanced README.md to improve user guidance and installation instructions. +- Added installation instructions for Playwright setup in README. +- Created and updated examples in `docs/examples/quickstart_async.py` to be more useful and user-friendly. +- Updated `requirements.txt` with a new `pydantic` dependency. +- Bumped version number in `crawl4ai/__version__.py` to 0.3.746. + +### Breaking Changes +- Streamlined application structure: + - Removed static pages and related code from `main.py` which might affect existing deployments relying on static content. + +### Development Updates +- Developed `post_install` method in `crawl4ai/install.py` to streamline post-installation setup tasks. +- Refined migration processes in `crawl4ai/migrations.py` with enhanced logging for better error visibility. +- Updated `docker-compose.yml` to support local and hub services for different architectures, enhancing build and deploy capabilities. 
+- Refactored example test cases in `docs/examples/docker_example.py` to facilitate comprehensive testing. + +### README.md +Updated README with new docker commands and setup instructions. +Enhanced installation instructions and guidance. + +### crawl4ai/install.py +Added post-install script functionality. +Introduced `post_install` method for automation of post-installation tasks. + +### crawl4ai/migrations.py +Improved migration logging. +Refined migration processes and added better logging. + +### docker-compose.yml +Refactored docker-compose for better service management. +Updated to define services for different platforms and versions. + +### requirements.txt +Updated dependencies. +Added `pydantic` to requirements file. + +### crawler/__version__.py +Updated version number. +Bumped version number to 0.3.746. + +### docs/examples/quickstart_async.py +Enhanced example scripts. +Uncommented example usage in async guide for user functionality. + +### main.py +Refactored code to improve maintainability. +Streamlined app structure by removing static pages code. + ## [0.3.743] November 27, 2024 Enhance features and documentation diff --git a/README.md b/README.md index bbfa5858..3d89ee19 100644 --- a/README.md +++ b/README.md @@ -220,48 +220,173 @@ Crawl4AI is available as Docker images for easy deployment. You can either pull --- -### Option 1: Docker Hub (Recommended) +
    +🐳 Option 1: Docker Hub (Recommended) +Choose the appropriate image based on your platform and needs: + +### For AMD64 (Regular Linux/Windows): ```bash -# Pull and run from Docker Hub (choose one): -docker pull unclecode/crawl4ai:basic # Basic crawling features -docker pull unclecode/crawl4ai:all # Full installation (ML, LLM support) -docker pull unclecode/crawl4ai:gpu # GPU-enabled version +# Basic version (recommended) +docker pull unclecode/crawl4ai:basic-amd64 +docker run -p 11235:11235 unclecode/crawl4ai:basic-amd64 -# Run the container -docker run -p 11235:11235 unclecode/crawl4ai:basic # Replace 'basic' with your chosen version +# Full ML/LLM support +docker pull unclecode/crawl4ai:all-amd64 +docker run -p 11235:11235 unclecode/crawl4ai:all-amd64 -# In case you want to set platform to arm64 -docker run --platform linux/arm64 -p 11235:11235 unclecode/crawl4ai:basic - -# In case to allocate more shared memory for the container -docker run --shm-size=2gb -p 11235:11235 unclecode/crawl4ai:basic +# With GPU support +docker pull unclecode/crawl4ai:gpu-amd64 +docker run -p 11235:11235 unclecode/crawl4ai:gpu-amd64 ``` ---- +### For ARM64 (M1/M2 Macs, ARM servers): +```bash +# Basic version (recommended) +docker pull unclecode/crawl4ai:basic-arm64 +docker run -p 11235:11235 unclecode/crawl4ai:basic-arm64 -### Option 2: Build from Repository +# Full ML/LLM support +docker pull unclecode/crawl4ai:all-arm64 +docker run -p 11235:11235 unclecode/crawl4ai:all-arm64 + +# With GPU support +docker pull unclecode/crawl4ai:gpu-arm64 +docker run -p 11235:11235 unclecode/crawl4ai:gpu-arm64 +``` + +Need more memory? 
Add `--shm-size`: +```bash +docker run --shm-size=2gb -p 11235:11235 unclecode/crawl4ai:basic-amd64 +``` + +Test the installation: +```bash +curl http://localhost:11235/health +``` + +### For Raspberry Pi (32-bit) (Experimental) +```bash +# Pull and run basic version (recommended for Raspberry Pi) +docker pull unclecode/crawl4ai:basic-armv7 +docker run -p 11235:11235 unclecode/crawl4ai:basic-armv7 + +# With increased shared memory if needed +docker run --shm-size=2gb -p 11235:11235 unclecode/crawl4ai:basic-armv7 +``` + +Note: Due to hardware constraints, only the basic version is recommended for Raspberry Pi. + +
    + +
    +🐳 Option 2: Build from Repository + +Build the image locally based on your platform: ```bash # Clone the repository git clone https://github.com/unclecode/crawl4ai.git cd crawl4ai -# Build the image -docker build -t crawl4ai:local \ - --build-arg INSTALL_TYPE=basic \ # Options: basic, all +# For AMD64 (Regular Linux/Windows) +docker build --platform linux/amd64 \ + --tag crawl4ai:local \ + --build-arg INSTALL_TYPE=basic \ . -# In case you want to set platform to arm64 -docker build -t crawl4ai:local \ - --build-arg INSTALL_TYPE=basic \ # Options: basic, all - --platform linux/arm64 \ +# For ARM64 (M1/M2 Macs, ARM servers) +docker build --platform linux/arm64 \ + --tag crawl4ai:local \ + --build-arg INSTALL_TYPE=basic \ . - -# Run your local build -docker run -p 11235:11235 crawl4ai:local ``` +Build options: +- INSTALL_TYPE=basic (default): Basic crawling features +- INSTALL_TYPE=all: Full ML/LLM support +- ENABLE_GPU=true: Add GPU support + +Example with all options: +```bash +docker build --platform linux/amd64 \ + --tag crawl4ai:local \ + --build-arg INSTALL_TYPE=all \ + --build-arg ENABLE_GPU=true \ + . +``` + +Run your local build: +```bash +# Regular run +docker run -p 11235:11235 crawl4ai:local + +# With increased shared memory +docker run --shm-size=2gb -p 11235:11235 crawl4ai:local +``` + +Test the installation: +```bash +curl http://localhost:11235/health +``` + +
    + +
    +🐳 Option 3: Using Docker Compose + +Docker Compose provides a more structured way to run Crawl4AI, especially when dealing with environment variables and multiple configurations. + +```bash +# Clone the repository +git clone https://github.com/unclecode/crawl4ai.git +cd crawl4ai +``` + +### For AMD64 (Regular Linux/Windows): +```bash +# Build and run locally +docker-compose --profile local-amd64 up + +# Run from Docker Hub +VERSION=basic docker-compose --profile hub-amd64 up # Basic version +VERSION=all docker-compose --profile hub-amd64 up # Full ML/LLM support +VERSION=gpu docker-compose --profile hub-amd64 up # GPU support +``` + +### For ARM64 (M1/M2 Macs, ARM servers): +```bash +# Build and run locally +docker-compose --profile local-arm64 up + +# Run from Docker Hub +VERSION=basic docker-compose --profile hub-arm64 up # Basic version +VERSION=all docker-compose --profile hub-arm64 up # Full ML/LLM support +VERSION=gpu docker-compose --profile hub-arm64 up # GPU support +``` + +Environment variables (optional): +```bash +# Create a .env file +CRAWL4AI_API_TOKEN=your_token +OPENAI_API_KEY=your_openai_key +CLAUDE_API_KEY=your_claude_key +``` + +The compose file includes: +- Memory management (4GB limit, 1GB reserved) +- Shared memory volume for browser support +- Health checks +- Auto-restart policy +- All necessary port mappings + +Test the installation: +```bash +curl http://localhost:11235/health +``` + +
    + --- ### Quick Test @@ -278,11 +403,11 @@ response = requests.post( ) task_id = response.json()["task_id"] -# Get results +# Continue polling until the task is complete (status="completed") result = requests.get(f"http://localhost:11235/task/{task_id}") ``` -For advanced configuration, environment variables, and usage examples, see our [Docker Deployment Guide](https://crawl4ai.com/mkdocs/basic/docker-deployment/). +For more examples, see our [Docker Examples](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/docker_example.py). For advanced configuration, environment variables, and usage examples, see our [Docker Deployment Guide](https://crawl4ai.com/mkdocs/basic/docker-deployment/).
    diff --git a/docker-compose.yml b/docker-compose.yml index b93beda9..4b22fd98 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,5 +1,6 @@ services: - crawl4ai: + # Local build services for different platforms + crawl4ai-amd64: build: context: . dockerfile: Dockerfile @@ -7,35 +8,39 @@ services: PYTHON_VERSION: "3.10" INSTALL_TYPE: ${INSTALL_TYPE:-basic} ENABLE_GPU: false - profiles: ["local"] - ports: - - "11235:11235" - - "8000:8000" - - "9222:9222" - - "8080:8080" - environment: - - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} - - OPENAI_API_KEY=${OPENAI_API_KEY:-} - - CLAUDE_API_KEY=${CLAUDE_API_KEY:-} - volumes: - - /dev/shm:/dev/shm - deploy: - resources: - limits: - memory: 4G - reservations: - memory: 1G - restart: unless-stopped - healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:11235/health"] - interval: 30s - timeout: 10s - retries: 3 - start_period: 40s + platforms: + - linux/amd64 + profiles: ["local-amd64"] + extends: &base-config + file: docker-compose.yml + service: base-config - crawl4ai-hub: - image: unclecode/crawl4ai:basic - profiles: ["hub"] + crawl4ai-arm64: + build: + context: . 
+ dockerfile: Dockerfile + args: + PYTHON_VERSION: "3.10" + INSTALL_TYPE: ${INSTALL_TYPE:-basic} + ENABLE_GPU: false + platforms: + - linux/arm64 + profiles: ["local-arm64"] + extends: *base-config + + # Hub services for different platforms and versions + crawl4ai-hub-amd64: + image: unclecode/crawl4ai:${VERSION:-basic}-amd64 + profiles: ["hub-amd64"] + extends: *base-config + + crawl4ai-hub-arm64: + image: unclecode/crawl4ai:${VERSION:-basic}-arm64 + profiles: ["hub-arm64"] + extends: *base-config + + # Base configuration to be extended + base-config: ports: - "11235:11235" - "8000:8000" @@ -59,4 +64,4 @@ services: interval: 30s timeout: 10s retries: 3 - start_period: 40s + start_period: 40s \ No newline at end of file diff --git a/docs/examples/docker_example.py b/docs/examples/docker_example.py index 17ef9f04..48acc809 100644 --- a/docs/examples/docker_example.py +++ b/docs/examples/docker_example.py @@ -78,20 +78,20 @@ def test_docker_deployment(version="basic"): time.sleep(5) # Test cases based on version - # test_basic_crawl(tester) - # test_basic_crawl(tester) - # test_basic_crawl_sync(tester) test_basic_crawl_direct(tester) + test_basic_crawl(tester) + test_basic_crawl(tester) + test_basic_crawl_sync(tester) - # if version in ["full", "transformer"]: - # test_cosine_extraction(tester) + if version in ["full", "transformer"]: + test_cosine_extraction(tester) - # test_js_execution(tester) - # test_css_selector(tester) - # test_structured_extraction(tester) - # test_llm_extraction(tester) - # test_llm_with_ollama(tester) - # test_screenshot(tester) + test_js_execution(tester) + test_css_selector(tester) + test_structured_extraction(tester) + test_llm_extraction(tester) + test_llm_with_ollama(tester) + test_screenshot(tester) def test_basic_crawl(tester: Crawl4AiTester): From 1def53b7fe60267d5bc1f492f50b5f53f8858eee Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 29 Nov 2024 20:53:43 +0800 Subject: [PATCH 105/115] docs: update Raspberry Pi section to 
indicate upcoming support --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3d89ee19..405c1002 100644 --- a/README.md +++ b/README.md @@ -265,7 +265,7 @@ Test the installation: curl http://localhost:11235/health ``` -### For Raspberry Pi (32-bit) (Experimental) +### For Raspberry Pi (32-bit) (coming soon): ```bash # Pull and run basic version (recommended for Raspberry Pi) docker pull unclecode/crawl4ai:basic-armv7 From 1ed7c15118fc81427fa29afe6368eb2a47720fd4 Mon Sep 17 00:00:00 2001 From: dvschuyl <125589423+dvschuyl@users.noreply.github.com> Date: Fri, 29 Nov 2024 14:06:04 +0100 Subject: [PATCH 106/115] :adhesive_bandage: Page-evaluate navigation destroyed error (#304) Thanks for your contribution and such a nice approach. Now that I think of it, I guess I can make good use of this for some other part of the code. By the way, thank you so much; I will add your name to the new list of contributors. --- crawl4ai/async_crawler_strategy.py | 1 + 1 file changed, 1 insertion(+) diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index e5316187..a41d29a8 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -920,6 +920,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): }); } """ + await page.wait_for_load_state() await page.evaluate(update_image_dimensions_js) # Wait a bit for any onload events to complete From 0780db55e1298e73178077ec0bdc65cd534faa8d Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 29 Nov 2024 21:12:19 +0800 Subject: [PATCH 107/115] fix: handle errors during image dimension updates in AsyncPlaywrightCrawlerStrategy --- crawl4ai/async_crawler_strategy.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index e5316187..cc7f3993 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ 
-920,7 +920,11 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): }); } """ - await page.evaluate(update_image_dimensions_js) + try: + await page.wait_for_load_state() + await page.evaluate(update_image_dimensions_js) + except Exception as e: + raise RuntimeError(f"Error updating image dimensions ACS-UPDATE_IMAGE_DIMENSIONS_JS: {str(e)}") # Wait a bit for any onload events to complete await page.wait_for_timeout(100) From 8c76a8c7dcb2820a351eeb5696db2fc04fce7805 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 29 Nov 2024 21:14:49 +0800 Subject: [PATCH 108/115] docs: add contributor entry for dvschuyl regarding AsyncPlaywrightCrawlerStrategy issue --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 663e5541..79038bdd 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -18,6 +18,7 @@ We would like to thank the following people for their contributions to Crawl4AI: ## Pull Requests +- [dvschuyl](https://github.com/dvschuyl) - AsyncPlaywrightCrawlerStrategy page-evaluate context destroyed by navigation [#304](https://github.com/unclecode/crawl4ai/pull/304) - [nelzomal](https://github.com/nelzomal) - Enhance development installation instructions [#286](https://github.com/unclecode/crawl4ai/pull/286) - [HamzaFarhan](https://github.com/HamzaFarhan) - Handled the cases where markdown_with_citations, references_markdown, and filtered_html might not be defined [#293](https://github.com/unclecode/crawl4ai/pull/293) - [NanmiCoder](https://github.com/NanmiCoder) - fix: crawler strategy exception handling and fixes [#271](https://github.com/unclecode/crawl4ai/pull/271) From 3e83893b3f41b7176f6ec0beaccab9f2b159785d Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sat, 30 Nov 2024 18:13:12 +0800 Subject: [PATCH 109/115] Enhance User-Agent Handling - Added a new UserAgentGenerator class for generating random User-Agents. - Integrated User-Agent generation in AsyncPlaywrightCrawlerStrategy for randomization. 
- Enhanced HTTP headers with generated Client Hints. --- crawl4ai/async_crawler_strategy.py | 33 +++- crawl4ai/user_agent_generator.py | 262 +++++++++++++++++++++++++++++ 2 files changed, 289 insertions(+), 6 deletions(-) create mode 100644 crawl4ai/user_agent_generator.py diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index cc7f3993..3d24bd84 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -16,6 +16,7 @@ import json import uuid from .models import AsyncCrawlResponse from .utils import create_box_message +from .user_agent_generator import UserAgentGenerator from playwright_stealth import StealthConfig, stealth_async stealth_config = StealthConfig( @@ -222,14 +223,20 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): self.use_cached_html = use_cached_html self.user_agent = kwargs.get( "user_agent", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " - "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" + "Mozilla/5.0 (Linux; Android 11; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.120 Mobile Safari/537.36" ) + user_agenr_generator = UserAgentGenerator() + if kwargs.get("user_agent_mode") == "random": + self.user_agent = user_agenr_generator.generate( + **kwargs.get("user_agent_generator_config", {}) + ) self.proxy = kwargs.get("proxy") self.proxy_config = kwargs.get("proxy_config") self.headless = kwargs.get("headless", True) self.browser_type = kwargs.get("browser_type", "chromium") self.headers = kwargs.get("headers", {}) + self.browser_hint = user_agenr_generator.generate_client_hints(self.user_agent) + self.headers.setdefault("sec-ch-ua", self.browser_hint) self.cookies = kwargs.get("cookies", []) self.sessions = {} self.session_ttl = 1800 @@ -307,7 +314,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if self.user_agent: await self.default_context.set_extra_http_headers({ - "User-Agent": self.user_agent + 
"User-Agent": self.user_agent, + "sec-ch-ua": self.browser_hint, + # **self.headers }) else: # Base browser arguments @@ -321,7 +330,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): "--disable-infobars", "--window-position=0,0", "--ignore-certificate-errors", - "--ignore-certificate-errors-spki-list" + "--ignore-certificate-errors-spki-list", + "--disable-blink-features=AutomationControlled", + ] } @@ -642,6 +653,15 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): self._cleanup_expired_sessions() session_id = kwargs.get("session_id") + # Check if in kwargs we have user_agent that will override the default user_agent + user_agent = kwargs.get("user_agent", self.user_agent) + + # Generate random user agent if magic mode is enabled and user_agent_mode is not random + if kwargs.get("user_agent_mode") != "random" and kwargs.get("magic", False): + user_agent = UserAgentGenerator().generate( + **kwargs.get("user_agent_generator_config", {}) + ) + # Handle page creation differently for managed browser context = None if self.use_managed_browser: @@ -666,7 +686,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): else: # Normal context creation for non-persistent or non-Chrome browsers context = await self.browser.new_context( - user_agent=self.user_agent, + user_agent=user_agent, viewport={"width": 1200, "height": 800}, proxy={"server": self.proxy} if self.proxy else None, java_script_enabled=True, @@ -686,10 +706,11 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): else: # Normal context creation context = await self.browser.new_context( - user_agent=self.user_agent, + user_agent=user_agent, viewport={"width": 1920, "height": 1080}, proxy={"server": self.proxy} if self.proxy else None, accept_downloads=self.accept_downloads, + ignore_https_errors=True # Add this line ) if self.cookies: await context.add_cookies(self.cookies) diff --git a/crawl4ai/user_agent_generator.py b/crawl4ai/user_agent_generator.py new file mode 
100644 index 00000000..0a4df0bb --- /dev/null +++ b/crawl4ai/user_agent_generator.py @@ -0,0 +1,262 @@ +import random +from typing import Optional, Literal, List, Dict, Tuple +import re + + +class UserAgentGenerator: + def __init__(self): + # Previous platform definitions remain the same... + self.desktop_platforms = { + "windows": { + "10_64": "(Windows NT 10.0; Win64; x64)", + "10_32": "(Windows NT 10.0; WOW64)", + }, + "macos": { + "intel": "(Macintosh; Intel Mac OS X 10_15_7)", + "newer": "(Macintosh; Intel Mac OS X 10.15; rv:109.0)", + }, + "linux": { + "generic": "(X11; Linux x86_64)", + "ubuntu": "(X11; Ubuntu; Linux x86_64)", + "chrome_os": "(X11; CrOS x86_64 14541.0.0)", + } + } + + self.mobile_platforms = { + "android": { + "samsung": "(Linux; Android 13; SM-S901B)", + "pixel": "(Linux; Android 12; Pixel 6)", + "oneplus": "(Linux; Android 13; OnePlus 9 Pro)", + "xiaomi": "(Linux; Android 12; M2102J20SG)", + }, + "ios": { + "iphone": "(iPhone; CPU iPhone OS 16_5 like Mac OS X)", + "ipad": "(iPad; CPU OS 16_5 like Mac OS X)", + } + } + + # Browser Combinations + self.browser_combinations = { + 1: [ + ["chrome"], + ["firefox"], + ["safari"], + ["edge"] + ], + 2: [ + ["gecko", "firefox"], + ["chrome", "safari"], + ["webkit", "safari"] + ], + 3: [ + ["chrome", "safari", "edge"], + ["webkit", "chrome", "safari"] + ] + } + + # Rendering Engines with versions + self.rendering_engines = { + "chrome_webkit": "AppleWebKit/537.36", + "safari_webkit": "AppleWebKit/605.1.15", + "gecko": [ # Added Gecko versions + "Gecko/20100101", + "Gecko/20100101", # Firefox usually uses this constant version + "Gecko/2010010", + ] + } + + # Browser Versions + self.chrome_versions = [ + "Chrome/119.0.6045.199", + "Chrome/118.0.5993.117", + "Chrome/117.0.5938.149", + "Chrome/116.0.5845.187", + "Chrome/115.0.5790.171", + ] + + self.edge_versions = [ + "Edg/119.0.2151.97", + "Edg/118.0.2088.76", + "Edg/117.0.2045.47", + "Edg/116.0.1938.81", + "Edg/115.0.1901.203", + ] + + 
self.safari_versions = [ + "Safari/537.36", # For Chrome-based + "Safari/605.1.15", + "Safari/604.1", + "Safari/602.1", + "Safari/601.5.17", + ] + + # Added Firefox versions + self.firefox_versions = [ + "Firefox/119.0", + "Firefox/118.0.2", + "Firefox/117.0.1", + "Firefox/116.0", + "Firefox/115.0.3", + "Firefox/114.0.2", + "Firefox/113.0.1", + "Firefox/112.0", + "Firefox/111.0.1", + "Firefox/110.0", + ] + + def get_browser_stack(self, num_browsers: int = 1) -> List[str]: + """Get a valid combination of browser versions""" + if num_browsers not in self.browser_combinations: + raise ValueError(f"Unsupported number of browsers: {num_browsers}") + + combination = random.choice(self.browser_combinations[num_browsers]) + browser_stack = [] + + for browser in combination: + if browser == "chrome": + browser_stack.append(random.choice(self.chrome_versions)) + elif browser == "firefox": + browser_stack.append(random.choice(self.firefox_versions)) + elif browser == "safari": + browser_stack.append(random.choice(self.safari_versions)) + elif browser == "edge": + browser_stack.append(random.choice(self.edge_versions)) + elif browser == "gecko": + browser_stack.append(random.choice(self.rendering_engines["gecko"])) + elif browser == "webkit": + browser_stack.append(self.rendering_engines["chrome_webkit"]) + + return browser_stack + + def generate(self, + device_type: Optional[Literal['desktop', 'mobile']] = None, + os_type: Optional[str] = None, + device_brand: Optional[str] = None, + browser_type: Optional[Literal['chrome', 'edge', 'safari', 'firefox']] = None, + num_browsers: int = 3) -> str: + """ + Generate a random user agent with specified constraints. 
+ + Args: + device_type: 'desktop' or 'mobile' + os_type: 'windows', 'macos', 'linux', 'android', 'ios' + device_brand: Specific device brand + browser_type: 'chrome', 'edge', 'safari', or 'firefox' + num_browsers: Number of browser specifications (1-3) + """ + # Get platform string + platform = self.get_random_platform(device_type, os_type, device_brand) + + # Start with Mozilla + components = ["Mozilla/5.0", platform] + + # Add browser stack + browser_stack = self.get_browser_stack(num_browsers) + + # Add appropriate legacy token based on browser stack + if "Firefox" in str(browser_stack): + components.append(random.choice(self.rendering_engines["gecko"])) + elif "Chrome" in str(browser_stack) or "Safari" in str(browser_stack): + components.append(self.rendering_engines["chrome_webkit"]) + components.append("(KHTML, like Gecko)") + + # Add browser versions + components.extend(browser_stack) + + return " ".join(components) + + def generate_with_client_hints(self, **kwargs) -> Tuple[str, str]: + """Generate both user agent and matching client hints""" + user_agent = self.generate(**kwargs) + client_hints = self.generate_client_hints(user_agent) + return user_agent, client_hints + + def get_random_platform(self, device_type, os_type, device_brand): + """Helper method to get random platform based on constraints""" + platforms = self.desktop_platforms if device_type == 'desktop' else \ + self.mobile_platforms if device_type == 'mobile' else \ + {**self.desktop_platforms, **self.mobile_platforms} + + if os_type: + for platform_group in [self.desktop_platforms, self.mobile_platforms]: + if os_type in platform_group: + platforms = {os_type: platform_group[os_type]} + break + + os_key = random.choice(list(platforms.keys())) + if device_brand and device_brand in platforms[os_key]: + return platforms[os_key][device_brand] + return random.choice(list(platforms[os_key].values())) + + def parse_user_agent(self, user_agent: str) -> Dict[str, str]: + """Parse a user agent string 
to extract browser and version information""" + browsers = { + 'chrome': r'Chrome/(\d+)', + 'edge': r'Edg/(\d+)', + 'safari': r'Version/(\d+)', + 'firefox': r'Firefox/(\d+)' + } + + result = {} + for browser, pattern in browsers.items(): + match = re.search(pattern, user_agent) + if match: + result[browser] = match.group(1) + + return result + + def generate_client_hints(self, user_agent: str) -> str: + """Generate Sec-CH-UA header value based on user agent string""" + browsers = self.parse_user_agent(user_agent) + + # Client hints components + hints = [] + + # Handle different browser combinations + if 'chrome' in browsers: + hints.append(f'"Chromium";v="{browsers["chrome"]}"') + hints.append('"Not_A Brand";v="8"') + + if 'edge' in browsers: + hints.append(f'"Microsoft Edge";v="{browsers["edge"]}"') + else: + hints.append(f'"Google Chrome";v="{browsers["chrome"]}"') + + elif 'firefox' in browsers: + # Firefox doesn't typically send Sec-CH-UA + return '""' + + elif 'safari' in browsers: + # Safari's format for client hints + hints.append(f'"Safari";v="{browsers["safari"]}"') + hints.append('"Not_A Brand";v="8"') + + return ', '.join(hints) + +# Example usage: +if __name__ == "__main__": + generator = UserAgentGenerator() + + print("\nSingle browser (Chrome):") + print(generator.generate(num_browsers=1, browser_type='chrome')) + + print("\nTwo browsers (Gecko/Firefox):") + print(generator.generate(num_browsers=2)) + + print("\nThree browsers (Chrome/Safari/Edge):") + print(generator.generate(num_browsers=3)) + + print("\nFirefox on Linux:") + print(generator.generate( + device_type='desktop', + os_type='linux', + browser_type='firefox', + num_browsers=2 + )) + + print("\nChrome/Safari/Edge on Windows:") + print(generator.generate( + device_type='desktop', + os_type='windows', + num_browsers=3 + )) \ No newline at end of file From 80d58ad24c64e30ab0c037496de89952516b772e Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sat, 30 Nov 2024 22:00:15 +0800 Subject: [PATCH 
110/115] bump version to 0.3.747 --- crawl4ai/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index 4a938b75..189a2955 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,2 +1,2 @@ # crawl4ai/_version.py -__version__ = "0.3.746" +__version__ = "0.3.747" From 293f299c083aab97aa06e8a06045caa7273aae15 Mon Sep 17 00:00:00 2001 From: unclecode Date: Sun, 1 Dec 2024 19:17:33 +0800 Subject: [PATCH 111/115] Add PruningContentFilter with unit tests and update documentation - Introduced the PruningContentFilter for better content relevance. - Implemented comprehensive unit tests for verification of functionality. - Enhanced existing BM25ContentFilter tests for edge case coverage. - Updated documentation to include usage examples for new filter. --- CHANGELOG.md | 50 +++ README.md | 7 +- crawl4ai/content_filter_strategy.py | 285 ++++++++++-------- crawl4ai/content_scraping_strategy.py | 13 +- docs/examples/quickstart_async.py | 8 +- docs/md_v2/advanced/managed_browser.md | 54 +++- docs/md_v2/basic/content_filtering.md | 58 +++- ..._filter.py => test_content_filter_bm25.py} | 0 tests/async/test_content_filter_prune.py | 159 ++++++++++ 9 files changed, 499 insertions(+), 135 deletions(-) rename tests/async/{test_content_filter.py => test_content_filter_bm25.py} (100%) create mode 100644 tests/async/test_content_filter_prune.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 309218dc..03a7afb0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,55 @@ # Changelog +## [0.3.75] December 1, 2024 + +### PruningContentFilter + +#### 1. Introduced PruningContentFilter (Dec 01, 2024) +A new content filtering strategy that removes less relevant nodes based on metrics like text and link density. + +**Affected Files:** +- `crawl4ai/content_filter_strategy.py`: Enhancement of content filtering capabilities. 
+```diff +Implemented effective pruning algorithm with comprehensive scoring. +``` +- `README.md`: Improved documentation regarding new features. +```diff +Updated to include usage and explanation for the PruningContentFilter. +``` +- `docs/md_v2/basic/content_filtering.md`: Expanded documentation for users. +```diff +Added detailed section explaining the PruningContentFilter. +``` + +#### 2. Added Unit Tests for PruningContentFilter (Dec 01, 2024) +Comprehensive tests added to ensure correct functionality of PruningContentFilter. + +**Affected Files:** +- `tests/async/test_content_filter_prune.py`: Increased test coverage for content filtering strategies. +```diff +Created test cases for various scenarios using the PruningContentFilter. +``` + +### Development Updates + +#### 3. Enhanced BM25ContentFilter tests (Dec 01, 2024) +Extended testing to cover additional edge cases and performance metrics. + +**Affected Files:** +- `tests/async/test_content_filter_bm25.py`: Improved reliability and performance assurance. +```diff +Added tests for new extraction scenarios including malformed HTML. +``` + +### Infrastructure & Documentation + +#### 4. Updated Examples (Dec 01, 2024) +Altered examples in documentation to promote the use of PruningContentFilter alongside existing strategies. + +**Affected Files:** +- `docs/examples/quickstart_async.py`: Enhanced usability and clarity for new users. +- Revised example to illustrate usage of PruningContentFilter. 
+ ## [0.3.746] November 29, 2024 ### Major Features diff --git a/README.md b/README.md index 405c1002..d70af8ad 100644 --- a/README.md +++ b/README.md @@ -422,7 +422,7 @@ You can check the project structure in the directory [https://github.com/uncleco ```python import asyncio from crawl4ai import AsyncWebCrawler, CacheMode -from crawl4ai.content_filter_strategy import BM25ContentFilter +from crawl4ai.content_filter_strategy import PruningContentFilter, BM25ContentFilter from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator async def main(): @@ -434,8 +434,11 @@ async def main(): url="https://docs.micronaut.io/4.7.6/guide/", cache_mode=CacheMode.ENABLED, markdown_generator=DefaultMarkdownGenerator( - content_filter=BM25ContentFilter(user_query=None, bm25_threshold=1.0) + content_filter=PruningContentFilter(threshold=0.48, threshold_type="fixed", min_word_threshold=0) ), + # markdown_generator=DefaultMarkdownGenerator( + # content_filter=BM25ContentFilter(user_query="WHEN_WE_FOCUS_BASED_ON_A_USER_QUERY", bm25_threshold=1.0) + # ), ) print(len(result.markdown)) print(len(result.fit_markdown)) diff --git a/crawl4ai/content_filter_strategy.py b/crawl4ai/content_filter_strategy.py index e6891a3f..ca3868bb 100644 --- a/crawl4ai/content_filter_strategy.py +++ b/crawl4ai/content_filter_strategy.py @@ -4,10 +4,10 @@ from typing import List, Tuple, Dict from rank_bm25 import BM25Okapi from time import perf_counter from collections import deque -from bs4 import BeautifulSoup, NavigableString, Tag +from bs4 import BeautifulSoup, NavigableString, Tag, Comment from .utils import clean_tokens from abc import ABC, abstractmethod - +import math from snowballstemmer import stemmer @@ -358,145 +358,186 @@ class BM25ContentFilter(RelevantContentFilter): return [self.clean_element(tag) for _, _, tag in selected_candidates] -class HeuristicContentFilter(RelevantContentFilter): - def __init__(self): - super().__init__() - # Weights for different heuristics - 
self.tag_weights = { - 'article': 10, - 'main': 8, - 'section': 5, - 'div': 3, - 'p': 2, - 'pre': 2, - 'code': 2, - 'blockquote': 2, - 'li': 1, - 'span': 1, - } - self.max_depth = 5 # Maximum depth from body to consider - def filter_content(self, html: str) -> List[str]: - """Implements heuristic content filtering without relying on a query.""" + + + +class PruningContentFilter(RelevantContentFilter): + def __init__(self, user_query: str = None, min_word_threshold: int = None, + threshold_type: str = 'fixed', threshold: float = 0.48): + super().__init__(user_query) + self.min_word_threshold = min_word_threshold + self.threshold_type = threshold_type + self.threshold = threshold + + # Add tag importance for dynamic threshold + self.tag_importance = { + 'article': 1.5, + 'main': 1.4, + 'section': 1.3, + 'p': 1.2, + 'h1': 1.4, + 'h2': 1.3, + 'h3': 1.2, + 'div': 0.7, + 'span': 0.6 + } + + # Metric configuration + self.metric_config = { + 'text_density': True, + 'link_density': True, + 'tag_weight': True, + 'class_id_weight': True, + 'text_length': True, + } + + self.metric_weights = { + 'text_density': 0.4, + 'link_density': 0.2, + 'tag_weight': 0.2, + 'class_id_weight': 0.1, + 'text_length': 0.1, + } + + self.tag_weights = { + 'div': 0.5, + 'p': 1.0, + 'article': 1.5, + 'section': 1.0, + 'span': 0.3, + 'li': 0.5, + 'ul': 0.5, + 'ol': 0.5, + 'h1': 1.2, + 'h2': 1.1, + 'h3': 1.0, + 'h4': 0.9, + 'h5': 0.8, + 'h6': 0.7, + } + + def filter_content(self, html: str, min_word_threshold: int = None) -> List[str]: if not html or not isinstance(html, str): return [] - + soup = BeautifulSoup(html, 'lxml') - - # Ensure there is a body tag if not soup.body: soup = BeautifulSoup(f'{html}', 'lxml') - body = soup.body + + # Remove comments and unwanted tags + self._remove_comments(soup) + self._remove_unwanted_tags(soup) + + # Prune tree starting from body + body = soup.find('body') + self._prune_tree(body) + + # Extract remaining content as list of HTML strings + content_blocks = [] + 
for element in body.children: + if isinstance(element, str) or not hasattr(element, 'name'): + continue + if len(element.get_text(strip=True)) > 0: + content_blocks.append(str(element)) + + return content_blocks - # Extract candidate text chunks - candidates = self.extract_text_chunks(body) + def _remove_comments(self, soup): + for element in soup(text=lambda text: isinstance(text, Comment)): + element.extract() - if not candidates: - return [] + def _remove_unwanted_tags(self, soup): + for tag in self.excluded_tags: + for element in soup.find_all(tag): + element.decompose() - # Score each candidate - scored_candidates = [] - for index, text, tag_type, tag in candidates: - score = self.score_element(tag, text) - if score > 0: - scored_candidates.append((score, index, text, tag)) + def _prune_tree(self, node): + if not node or not hasattr(node, 'name') or node.name is None: + return - # Sort candidates by score and then by document order - scored_candidates.sort(key=lambda x: (-x[0], x[1])) + text_len = len(node.get_text(strip=True)) + tag_len = len(node.encode_contents().decode('utf-8')) + link_text_len = sum(len(s.strip()) for s in (a.string for a in node.find_all('a', recursive=False)) if s) - # Extract the top candidates (e.g., top 5) - top_candidates = scored_candidates[:5] # Adjust the number as needed + metrics = { + 'node': node, + 'tag_name': node.name, + 'text_len': text_len, + 'tag_len': tag_len, + 'link_text_len': link_text_len + } - # Sort the top candidates back to their original document order - top_candidates.sort(key=lambda x: x[1]) + score = self._compute_composite_score(metrics) - # Clean and return the content - return [self.clean_element(tag) for _, _, _, tag in top_candidates] + if self.threshold_type == 'fixed': + should_remove = score < self.threshold + else: # dynamic + tag_importance = self.tag_importance.get(node.name, 0.7) + text_ratio = text_len / tag_len if tag_len > 0 else 0 + link_ratio = link_text_len / text_len if text_len > 0 else 
1 + + threshold = self.threshold # base threshold + if tag_importance > 1: + threshold *= 0.8 + if text_ratio > 0.4: + threshold *= 0.9 + if link_ratio > 0.6: + threshold *= 1.2 + + should_remove = score < threshold - def score_element(self, tag: Tag, text: str) -> float: - """Compute a score for an element based on heuristics.""" - if not text or not tag: - return 0 + if should_remove: + node.decompose() + else: + children = [child for child in node.children if hasattr(child, 'name')] + for child in children: + self._prune_tree(child) - # Exclude unwanted tags - if self.is_excluded(tag): - return 0 + def _compute_composite_score(self, metrics, text_len, tag_len, link_text_len): + if self.min_word_threshold: + # Get raw text from metrics node - avoid extra processing + text = metrics['node'].get_text(strip=True) + word_count = text.count(' ') + 1 + if word_count < self.min_word_threshold: + return -1.0 # Guaranteed removal + score = 0.0 + total_weight = 0.0 - # Text density - text_length = len(text.strip()) - html_length = len(str(tag)) - text_density = text_length / html_length if html_length > 0 else 0 + if self.metric_config['text_density']: + density = text_len / tag_len if tag_len > 0 else 0 + score += self.metric_weights['text_density'] * density + total_weight += self.metric_weights['text_density'] - # Link density - link_text_length = sum(len(a.get_text().strip()) for a in tag.find_all('a')) - link_density = link_text_length / text_length if text_length > 0 else 0 + if self.metric_config['link_density']: + density = 1 - (link_text_len / text_len if text_len > 0 else 0) + score += self.metric_weights['link_density'] * density + total_weight += self.metric_weights['link_density'] - # Tag weight - tag_weight = self.tag_weights.get(tag.name, 1) + if self.metric_config['tag_weight']: + tag_score = self.tag_weights.get(metrics['tag_name'], 0.5) + score += self.metric_weights['tag_weight'] * tag_score + total_weight += self.metric_weights['tag_weight'] - # Depth 
factor (prefer elements closer to the body tag) - depth = self.get_depth(tag) - depth_weight = max(self.max_depth - depth, 1) / self.max_depth + if self.metric_config['class_id_weight']: + class_score = self._compute_class_id_weight(metrics['node']) + score += self.metric_weights['class_id_weight'] * max(0, class_score) + total_weight += self.metric_weights['class_id_weight'] - # Compute the final score - score = (text_density * tag_weight * depth_weight) / (1 + link_density) + if self.metric_config['text_length']: + score += self.metric_weights['text_length'] * math.log(text_len + 1) + total_weight += self.metric_weights['text_length'] - return score + return score / total_weight if total_weight > 0 else 0 - def get_depth(self, tag: Tag) -> int: - """Compute the depth of the tag from the body tag.""" - depth = 0 - current = tag - while current and current != current.parent and current.name != 'body': - current = current.parent - depth += 1 - return depth - - def extract_text_chunks(self, body: Tag) -> List[Tuple[int, str, str, Tag]]: - """ - Extracts text chunks from the body element while preserving order. - Returns list of tuples (index, text, tag_type, tag) for scoring. 
- """ - chunks = [] - index = 0 - - def traverse(element): - nonlocal index - if isinstance(element, NavigableString): - return - if not isinstance(element, Tag): - return - if self.is_excluded(element): - return - # Only consider included tags - if element.name in self.included_tags: - text = element.get_text(separator=' ', strip=True) - if len(text.split()) >= self.min_word_count: - tag_type = 'header' if element.name in self.header_tags else 'content' - chunks.append((index, text, tag_type, element)) - index += 1 - # Do not traverse children of this element to prevent duplication - return - for child in element.children: - traverse(child) - - traverse(body) - return chunks - - def is_excluded(self, tag: Tag) -> bool: - """Determine if a tag should be excluded based on heuristics.""" - if tag.name in self.excluded_tags: - return True - class_id = ' '.join(filter(None, [ - ' '.join(tag.get('class', [])), - tag.get('id', '') - ])) - if self.negative_patterns.search(class_id): - return True - # Exclude tags with high link density (e.g., navigation menus) - text = tag.get_text(separator=' ', strip=True) - link_text_length = sum(len(a.get_text(strip=True)) for a in tag.find_all('a')) - text_length = len(text) - if text_length > 0 and (link_text_length / text_length) > 0.5: - return True - return False + def _compute_class_id_weight(self, node): + class_id_score = 0 + if 'class' in node.attrs: + classes = ' '.join(node['class']) + if self.negative_patterns.match(classes): + class_id_score -= 0.5 + if 'id' in node.attrs: + element_id = node['id'] + if self.negative_patterns.match(element_id): + class_id_score -= 0.5 + return class_id_score \ No newline at end of file diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index ec6c3361..de8894b7 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -9,7 +9,7 @@ from bs4 import element, NavigableString, Comment from urllib.parse import 
urljoin from requests.exceptions import InvalidSchema # from .content_cleaning_strategy import ContentCleaningStrategy -from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter#, HeuristicContentFilter +from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter, PruningContentFilter from .markdown_generation_strategy import MarkdownGenerationStrategy, DefaultMarkdownGenerator from .models import MarkdownGenerationResult from .utils import ( @@ -110,10 +110,15 @@ class WebScrapingStrategy(ContentScrapingStrategy): if markdown_generator: try: if kwargs.get('fit_markdown', False) and not markdown_generator.content_filter: - markdown_generator.content_filter = BM25ContentFilter( - user_query=kwargs.get('fit_markdown_user_query', None), - bm25_threshold=kwargs.get('fit_markdown_bm25_threshold', 1.0) + markdown_generator.content_filter = PruningContentFilter( + threshold_type=kwargs.get('fit_markdown_treshold_type', 'fixed'), + threshold=kwargs.get('fit_markdown_treshold', 0.48), + min_word_threshold=kwargs.get('fit_markdown_min_word_threshold', ), ) + # markdown_generator.content_filter = BM25ContentFilter( + # user_query=kwargs.get('fit_markdown_user_query', None), + # bm25_threshold=kwargs.get('fit_markdown_bm25_threshold', 1.0) + # ) markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown( cleaned_html=cleaned_html, diff --git a/docs/examples/quickstart_async.py b/docs/examples/quickstart_async.py index 679a9bc2..73d695c3 100644 --- a/docs/examples/quickstart_async.py +++ b/docs/examples/quickstart_async.py @@ -15,7 +15,7 @@ from bs4 import BeautifulSoup from pydantic import BaseModel, Field from crawl4ai import AsyncWebCrawler, CacheMode from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator -from crawl4ai.content_filter_strategy import BM25ContentFilter +from crawl4ai.content_filter_strategy import BM25ContentFilter, PruningContentFilter from crawl4ai.extraction_strategy import ( 
JsonCssExtractionStrategy, LLMExtractionStrategy, @@ -466,7 +466,8 @@ async def speed_comparison(): url="https://www.nbcnews.com/business", word_count_threshold=0, markdown_generator=DefaultMarkdownGenerator( - content_filter=BM25ContentFilter(user_query=None, bm25_threshold=1.0) + content_filter = PruningContentFilter(threshold=0.48, threshold_type="fixed", min_word_threshold=0) + # content_filter=BM25ContentFilter(user_query=None, bm25_threshold=1.0) ), cache_mode=CacheMode.BYPASS, verbose=False, @@ -489,7 +490,8 @@ async def speed_comparison(): word_count_threshold=0, cache_mode=CacheMode.BYPASS, markdown_generator=DefaultMarkdownGenerator( - content_filter=BM25ContentFilter(user_query=None, bm25_threshold=1.0) + content_filter = PruningContentFilter(threshold=0.48, threshold_type="fixed", min_word_threshold=0) + # content_filter=BM25ContentFilter(user_query=None, bm25_threshold=1.0) ), verbose=False, ) diff --git a/docs/md_v2/advanced/managed_browser.md b/docs/md_v2/advanced/managed_browser.md index 80d6fc1a..0d327f2e 100644 --- a/docs/md_v2/advanced/managed_browser.md +++ b/docs/md_v2/advanced/managed_browser.md @@ -4,7 +4,59 @@ This guide explains how to use content filtering strategies in Crawl4AI to extra ## Relevance Content Filter -The `RelevanceContentFilter` is an abstract class that provides a common interface for content filtering strategies. Specific filtering algorithms, like `BM25ContentFilter`, inherit from this class and implement the `filter_content` method. This method takes the HTML content as input and returns a list of filtered text blocks. +The `RelevantContentFilter` is an abstract class that provides a common interface for content filtering strategies. Specific filtering algorithms, like `PruningContentFilter` or `BM25ContentFilter`, inherit from this class and implement the `filter_content` method. This method takes the HTML content as input and returns a list of filtered text blocks. 
+ + +## Pruning Content Filter + +The `PruningContentFilter` is a tree-shaking algorithm that analyzes the HTML DOM structure and removes less relevant nodes based on various metrics like text density, link density, and tag importance. It evaluates each node using a composite scoring system and "prunes" nodes that fall below a certain threshold. + +### Usage + +```python +from crawl4ai import AsyncWebCrawler +from crawl4ai.content_filter_strategy import PruningContentFilter + +async def filter_content(url): + async with AsyncWebCrawler() as crawler: + content_filter = PruningContentFilter( + min_word_threshold=5, + threshold_type='dynamic', + threshold=0.45 + ) + result = await crawler.arun(url=url, extraction_strategy=content_filter, fit_markdown=True) + if result.success: + print(f"Cleaned Markdown:\n{result.fit_markdown}") +``` + +### Parameters + +- **`min_word_threshold`**: (Optional) Minimum number of words a node must contain to be considered relevant. Nodes with fewer words are automatically pruned. + +- **`threshold_type`**: (Optional, default 'fixed') Controls how pruning thresholds are calculated: + - `'fixed'`: Uses a constant threshold value for all nodes + - `'dynamic'`: Adjusts threshold based on node characteristics like tag importance and text/link ratios + +- **`threshold`**: (Optional, default 0.48) Base threshold value for node pruning: + - For fixed threshold: Nodes scoring below this value are removed + - For dynamic threshold: This value is adjusted based on node properties + +### How It Works + +The pruning algorithm evaluates each node using multiple metrics: +- Text density: Ratio of actual text to overall node content +- Link density: Proportion of text within links +- Tag importance: Weight based on HTML tag type (e.g., article, p, div) +- Content quality: Metrics like text length and structural importance + +Nodes scoring below the threshold are removed, effectively "shaking" less relevant content from the DOM tree. 
This results in a cleaner document containing only the most relevant content blocks. + +The algorithm is particularly effective for: +- Removing boilerplate content +- Eliminating navigation menus and sidebars +- Preserving main article content +- Maintaining document structure while removing noise + ## BM25 Algorithm diff --git a/docs/md_v2/basic/content_filtering.md b/docs/md_v2/basic/content_filtering.md index 9506c075..0d327f2e 100644 --- a/docs/md_v2/basic/content_filtering.md +++ b/docs/md_v2/basic/content_filtering.md @@ -4,7 +4,59 @@ This guide explains how to use content filtering strategies in Crawl4AI to extra ## Relevance Content Filter -The `RelevanceContentFilter` is an abstract class that provides a common interface for content filtering strategies. Specific filtering algorithms, like `BM25ContentFilter`, inherit from this class and implement the `filter_content` method. This method takes the HTML content as input and returns a list of filtered text blocks. +The `RelevantContentFilter` is an abstract class that provides a common interface for content filtering strategies. Specific filtering algorithms, like `PruningContentFilter` or `BM25ContentFilter`, inherit from this class and implement the `filter_content` method. This method takes the HTML content as input and returns a list of filtered text blocks. 
+ +### Usage + +```python +from crawl4ai import AsyncWebCrawler +from crawl4ai.content_filter_strategy import PruningContentFilter + +async def filter_content(url): + async with AsyncWebCrawler() as crawler: + content_filter = PruningContentFilter( + min_word_threshold=5, + threshold_type='dynamic', + threshold=0.45 + ) + result = await crawler.arun(url=url, extraction_strategy=content_filter, fit_markdown=True) + if result.success: + print(f"Cleaned Markdown:\n{result.fit_markdown}") +``` + +### Parameters + +- **`min_word_threshold`**: (Optional) Minimum number of words a node must contain to be considered relevant. Nodes with fewer words are automatically pruned. + +- **`threshold_type`**: (Optional, default 'fixed') Controls how pruning thresholds are calculated: + - `'fixed'`: Uses a constant threshold value for all nodes + - `'dynamic'`: Adjusts threshold based on node characteristics like tag importance and text/link ratios + +- **`threshold`**: (Optional, default 0.48) Base threshold value for node pruning: + - For fixed threshold: Nodes scoring below this value are removed + - For dynamic threshold: This value is adjusted based on node properties + +### How It Works + +The pruning algorithm evaluates each node using multiple metrics: +- Text density: Ratio of actual text to overall node content +- Link density: Proportion of text within links +- Tag importance: Weight based on HTML tag type (e.g., article, p, div) +- Content quality: Metrics like text length and structural importance + +Nodes scoring below the threshold are removed, effectively "shaking" less relevant content from the DOM tree. This results in a cleaner document containing only the most relevant content blocks. 
+ +The algorithm is particularly effective for: +- Removing boilerplate content +- Eliminating navigation menus and sidebars +- Preserving main article content +- Maintaining document structure while removing noise + ## BM25 Algorithm @@ -21,7 +73,7 @@ from crawl4ai.content_filter_strategy import BM25ContentFilter async def filter_content(url, query=None): async with AsyncWebCrawler() as crawler: content_filter = BM25ContentFilter(user_query=query) - result = await crawler.arun(url=url, content_filter=content_filter, fit_markdown=True) # Set fit_markdown flag to True to trigger BM25 filtering + result = await crawler.arun(url=url, extraction_strategy=content_filter, fit_markdown=True) # Set fit_markdown flag to True to trigger BM25 filtering if result.success: print(f"Filtered Content (JSON):\n{result.extracted_content}") print(f"\nFiltered Markdown:\n{result.fit_markdown}") # New field in CrawlResult object @@ -71,7 +123,7 @@ class MyCustomFilter(RelevantContentFilter): async def custom_filter_demo(url: str): async with AsyncWebCrawler() as crawler: custom_filter = MyCustomFilter() - result = await crawler.arun(url, content_filter=custom_filter) + result = await crawler.arun(url, extraction_strategy=custom_filter) if result.success: print(result.extracted_content) diff --git a/tests/async/test_content_filter.py b/tests/async/test_content_filter_bm25.py similarity index 100% rename from tests/async/test_content_filter.py rename to tests/async/test_content_filter_bm25.py diff --git a/tests/async/test_content_filter_prune.py b/tests/async/test_content_filter_prune.py new file mode 100644 index 00000000..23b0fa3a --- /dev/null +++ b/tests/async/test_content_filter_prune.py @@ -0,0 +1,159 @@ +import os, sys +import pytest +from bs4 import BeautifulSoup + +parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +sys.path.append(parent_dir) + +from crawl4ai.content_filter_strategy import PruningContentFilter + +@pytest.fixture +def basic_html(): + return 
""" + + +
    +

    Main Article

    +

    This is a high-quality paragraph with substantial text content. It contains enough words to pass the threshold and has good text density without too many links. This kind of content should survive the pruning process.

    + + +
    + + + """ + +@pytest.fixture +def link_heavy_html(): + return """ + + +
    +

    Good content paragraph that should remain.

    + +
    + + + """ + +@pytest.fixture +def mixed_content_html(): + return """ + + +
    +

    Article Title

    +

    Short summary.

    +
    +

    Long high-quality paragraph with substantial content that should definitely survive the pruning process. This content has good text density and proper formatting which makes it valuable for retention.

    +
    +
    +

    Short comment 1

    +

    Short comment 2

    +
    +
    + + + """ + +class TestPruningContentFilter: + def test_basic_pruning(self, basic_html): + """Test basic content pruning functionality""" + filter = PruningContentFilter(min_word_threshold=5) + contents = filter.filter_content(basic_html) + + combined_content = ' '.join(contents).lower() + assert "high-quality paragraph" in combined_content + assert "sidebar content" not in combined_content + assert "share buttons" not in combined_content + + def test_min_word_threshold(self, mixed_content_html): + """Test minimum word threshold filtering""" + filter = PruningContentFilter(min_word_threshold=10) + contents = filter.filter_content(mixed_content_html) + + combined_content = ' '.join(contents).lower() + assert "short summary" not in combined_content + assert "long high-quality paragraph" in combined_content + assert "short comment" not in combined_content + + def test_threshold_types(self, basic_html): + """Test fixed vs dynamic thresholds""" + fixed_filter = PruningContentFilter(threshold_type='fixed', threshold=0.48) + dynamic_filter = PruningContentFilter(threshold_type='dynamic', threshold=0.45) + + fixed_contents = fixed_filter.filter_content(basic_html) + dynamic_contents = dynamic_filter.filter_content(basic_html) + + assert len(fixed_contents) != len(dynamic_contents), \ + "Fixed and dynamic thresholds should yield different results" + + def test_link_density_impact(self, link_heavy_html): + """Test handling of link-heavy content""" + filter = PruningContentFilter(threshold_type='dynamic') + contents = filter.filter_content(link_heavy_html) + + combined_content = ' '.join(contents).lower() + assert "good content paragraph" in combined_content + assert len([c for c in contents if 'href' in c]) < 2, \ + "Should prune link-heavy sections" + + def test_tag_importance(self, mixed_content_html): + """Test tag importance in scoring""" + filter = PruningContentFilter(threshold_type='dynamic') + contents = filter.filter_content(mixed_content_html) + + has_article = 
any('article' in c.lower() for c in contents) + has_h1 = any('h1' in c.lower() for c in contents) + assert has_article or has_h1, "Should retain important tags" + + def test_empty_input(self): + """Test handling of empty input""" + filter = PruningContentFilter() + assert filter.filter_content("") == [] + assert filter.filter_content(None) == [] + + def test_malformed_html(self): + """Test handling of malformed HTML""" + malformed_html = "
    Unclosed div

    Nestedcontent

    " + filter = PruningContentFilter() + contents = filter.filter_content(malformed_html) + assert isinstance(contents, list) + + def test_performance(self, basic_html): + """Test performance with timer""" + filter = PruningContentFilter() + + import time + start = time.perf_counter() + filter.filter_content(basic_html) + duration = time.perf_counter() - start + + # Extra strict on performance since you mentioned milliseconds matter + assert duration < 0.1, f"Processing took too long: {duration:.3f} seconds" + + @pytest.mark.parametrize("threshold,expected_count", [ + (0.3, 4), # Very lenient + (0.48, 2), # Default + (0.7, 1), # Very strict + ]) + def test_threshold_levels(self, mixed_content_html, threshold, expected_count): + """Test different threshold levels""" + filter = PruningContentFilter(threshold_type='fixed', threshold=threshold) + contents = filter.filter_content(mixed_content_html) + assert len(contents) <= expected_count, \ + f"Expected {expected_count} or fewer elements with threshold {threshold}" + + def test_consistent_output(self, basic_html): + """Test output consistency across multiple runs""" + filter = PruningContentFilter() + first_run = filter.filter_content(basic_html) + second_run = filter.filter_content(basic_html) + assert first_run == second_run, "Output should be consistent" + +if __name__ == "__main__": + pytest.main([__file__]) \ No newline at end of file From 95a4f74d2a9c0ae8c6f727cce6f6d0c17694aeb4 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Mon, 2 Dec 2024 20:37:28 +0800 Subject: [PATCH 112/115] fix: pass logger to WebScrapingStrategy and update score computation in PruningContentFilter --- crawl4ai/async_webcrawler.py | 4 +++- crawl4ai/content_filter_strategy.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 66b4c21b..8db69333 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -472,7 +472,9 @@ class 
AsyncWebCrawler: try: _url = url if not kwargs.get("is_raw_html", False) else "Raw HTML" t1 = time.perf_counter() - scrapping_strategy = WebScrapingStrategy() + scrapping_strategy = WebScrapingStrategy( + logger=self.logger, + ) # result = await scrapping_strategy.ascrap( result = scrapping_strategy.scrap( url, diff --git a/crawl4ai/content_filter_strategy.py b/crawl4ai/content_filter_strategy.py index ca3868bb..f05b92fa 100644 --- a/crawl4ai/content_filter_strategy.py +++ b/crawl4ai/content_filter_strategy.py @@ -468,7 +468,7 @@ class PruningContentFilter(RelevantContentFilter): 'link_text_len': link_text_len } - score = self._compute_composite_score(metrics) + score = self._compute_composite_score(metrics, text_len, tag_len, link_text_len) if self.threshold_type == 'fixed': should_remove = score < self.threshold From e9639ad18972d11929823ff9b1bb9794ad938750 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Tue, 3 Dec 2024 19:44:38 +0800 Subject: [PATCH 113/115] refactor: improve error handling in DataProcessor and optimize data parsing logic --- crawl4ai/async_webcrawler.py | 305 ++++++++++++++++++----------------- 1 file changed, 155 insertions(+), 150 deletions(-) diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 8db69333..2c17602d 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -7,6 +7,7 @@ from pathlib import Path from typing import Optional, List, Union import json import asyncio +from contextlib import nullcontext from .models import CrawlResult, MarkdownGenerationResult from .async_database import async_db_manager from .chunking_strategy import * @@ -67,6 +68,7 @@ class AsyncWebCrawler: always_bypass_cache: bool = False, always_by_pass_cache: Optional[bool] = None, # Deprecated parameter base_directory: str = str(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())), + thread_safe: bool = False, **kwargs, ): """ @@ -104,6 +106,8 @@ class AsyncWebCrawler: else: self.always_bypass_cache = 
always_bypass_cache + self._lock = asyncio.Lock() if thread_safe else None + self.crawl4ai_folder = os.path.join(base_directory, ".crawl4ai") os.makedirs(self.crawl4ai_folder, exist_ok=True) os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True) @@ -178,169 +182,170 @@ class AsyncWebCrawler: Returns: CrawlResult: The result of crawling and processing """ - try: - # Handle deprecated parameters - if any([bypass_cache, disable_cache, no_cache_read, no_cache_write]): - if kwargs.get("warning", True): - warnings.warn( - "Cache control boolean flags are deprecated and will be removed in version X.X.X. " - "Use 'cache_mode' parameter instead. Examples:\n" - "- For bypass_cache=True, use cache_mode=CacheMode.BYPASS\n" - "- For disable_cache=True, use cache_mode=CacheMode.DISABLED\n" - "- For no_cache_read=True, use cache_mode=CacheMode.WRITE_ONLY\n" - "- For no_cache_write=True, use cache_mode=CacheMode.READ_ONLY\n" - "Pass warning=False to suppress this warning.", - DeprecationWarning, - stacklevel=2 - ) + async with self._lock or nullcontext(): + try: + # Handle deprecated parameters + if any([bypass_cache, disable_cache, no_cache_read, no_cache_write]): + if kwargs.get("warning", True): + warnings.warn( + "Cache control boolean flags are deprecated and will be removed in version X.X.X. " + "Use 'cache_mode' parameter instead. 
Examples:\n" + "- For bypass_cache=True, use cache_mode=CacheMode.BYPASS\n" + "- For disable_cache=True, use cache_mode=CacheMode.DISABLED\n" + "- For no_cache_read=True, use cache_mode=CacheMode.WRITE_ONLY\n" + "- For no_cache_write=True, use cache_mode=CacheMode.READ_ONLY\n" + "Pass warning=False to suppress this warning.", + DeprecationWarning, + stacklevel=2 + ) + + # Convert legacy parameters if cache_mode not provided + if cache_mode is None: + cache_mode = _legacy_to_cache_mode( + disable_cache=disable_cache, + bypass_cache=bypass_cache, + no_cache_read=no_cache_read, + no_cache_write=no_cache_write + ) - # Convert legacy parameters if cache_mode not provided + # Default to ENABLED if no cache mode specified if cache_mode is None: - cache_mode = _legacy_to_cache_mode( - disable_cache=disable_cache, - bypass_cache=bypass_cache, - no_cache_read=no_cache_read, - no_cache_write=no_cache_write + cache_mode = CacheMode.ENABLED + + # Create cache context + cache_context = CacheContext(url, cache_mode, self.always_bypass_cache) + + extraction_strategy = extraction_strategy or NoExtractionStrategy() + extraction_strategy.verbose = verbose + if not isinstance(extraction_strategy, ExtractionStrategy): + raise ValueError("Unsupported extraction strategy") + if not isinstance(chunking_strategy, ChunkingStrategy): + raise ValueError("Unsupported chunking strategy") + + word_count_threshold = max(word_count_threshold, MIN_WORD_THRESHOLD) + + async_response: AsyncCrawlResponse = None + cached_result = None + screenshot_data = None + extracted_content = None + + start_time = time.perf_counter() + + # Try to get cached result if appropriate + if cache_context.should_read(): + cached_result = await async_db_manager.aget_cached_url(url) + + if cached_result: + html = sanitize_input_encode(cached_result.html) + extracted_content = sanitize_input_encode(cached_result.extracted_content or "") + if screenshot: + screenshot_data = cached_result.screenshot + if not screenshot_data: + 
cached_result = None + # if verbose: + # print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Cache hit for {cache_context.display_url} | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {time.perf_counter() - start_time:.2f}s") + self.logger.url_status( + url=cache_context.display_url, + success=bool(html), + timing=time.perf_counter() - start_time, + tag="FETCH" + ) + + + # Fetch fresh content if needed + if not cached_result or not html: + t1 = time.perf_counter() + + if user_agent: + self.crawler_strategy.update_user_agent(user_agent) + async_response: AsyncCrawlResponse = await self.crawler_strategy.crawl( + url, + screenshot=screenshot, + **kwargs ) - - # Default to ENABLED if no cache mode specified - if cache_mode is None: - cache_mode = CacheMode.ENABLED - - # Create cache context - cache_context = CacheContext(url, cache_mode, self.always_bypass_cache) - - extraction_strategy = extraction_strategy or NoExtractionStrategy() - extraction_strategy.verbose = verbose - if not isinstance(extraction_strategy, ExtractionStrategy): - raise ValueError("Unsupported extraction strategy") - if not isinstance(chunking_strategy, ChunkingStrategy): - raise ValueError("Unsupported chunking strategy") - - word_count_threshold = max(word_count_threshold, MIN_WORD_THRESHOLD) - - async_response: AsyncCrawlResponse = None - cached_result = None - screenshot_data = None - extracted_content = None - - start_time = time.perf_counter() - - # Try to get cached result if appropriate - if cache_context.should_read(): - cached_result = await async_db_manager.aget_cached_url(url) - - if cached_result: - html = sanitize_input_encode(cached_result.html) - extracted_content = sanitize_input_encode(cached_result.extracted_content or "") - if screenshot: - screenshot_data = cached_result.screenshot - if not screenshot_data: - cached_result = None - # if verbose: - # print(f"{Fore.BLUE}{self.tag_format('FETCH')} 
{self.log_icons['FETCH']} Cache hit for {cache_context.display_url} | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {time.perf_counter() - start_time:.2f}s") - self.logger.url_status( + html = sanitize_input_encode(async_response.html) + screenshot_data = async_response.screenshot + t2 = time.perf_counter() + self.logger.url_status( url=cache_context.display_url, success=bool(html), - timing=time.perf_counter() - start_time, + timing=t2 - t1, tag="FETCH" - ) + ) + # if verbose: + # print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Live fetch for {cache_context.display_url}... | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {t2 - t1:.2f}s") - - # Fetch fresh content if needed - if not cached_result or not html: - t1 = time.perf_counter() + # Process the HTML content + crawl_result = await self.aprocess_html( + url=url, + html=html, + extracted_content=extracted_content, + word_count_threshold=word_count_threshold, + extraction_strategy=extraction_strategy, + chunking_strategy=chunking_strategy, + content_filter=content_filter, + css_selector=css_selector, + screenshot=screenshot_data, + verbose=verbose, + is_cached=bool(cached_result), + async_response=async_response, + is_web_url=cache_context.is_web_url, + is_local_file=cache_context.is_local_file, + is_raw_html=cache_context.is_raw_html, + **kwargs, + ) - if user_agent: - self.crawler_strategy.update_user_agent(user_agent) - async_response: AsyncCrawlResponse = await self.crawler_strategy.crawl( - url, - screenshot=screenshot, - **kwargs - ) - html = sanitize_input_encode(async_response.html) - screenshot_data = async_response.screenshot - t2 = time.perf_counter() - self.logger.url_status( - url=cache_context.display_url, - success=bool(html), - timing=t2 - t1, - tag="FETCH" - ) + # Set response data + if async_response: + crawl_result.status_code = async_response.status_code + crawl_result.response_headers = 
async_response.response_headers + crawl_result.downloaded_files = async_response.downloaded_files + else: + crawl_result.status_code = 200 + crawl_result.response_headers = cached_result.response_headers if cached_result else {} + + crawl_result.success = bool(html) + crawl_result.session_id = kwargs.get("session_id", None) + # if verbose: - # print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Live fetch for {cache_context.display_url}... | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {t2 - t1:.2f}s") + # print(f"{Fore.GREEN}{self.tag_format('COMPLETE')} {self.log_icons['COMPLETE']} {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... | Status: {Fore.GREEN if crawl_result.success else Fore.RED}{crawl_result.success} | {Fore.YELLOW}Total: {time.perf_counter() - start_time:.2f}s{Style.RESET_ALL}") + self.logger.success( + message="{url:.50}... | Status: {status} | Total: {timing}", + tag="COMPLETE", + params={ + "url": cache_context.display_url, + "status": crawl_result.success, + "timing": f"{time.perf_counter() - start_time:.2f}s" + }, + colors={ + "status": Fore.GREEN if crawl_result.success else Fore.RED, + "timing": Fore.YELLOW + } + ) - # Process the HTML content - crawl_result = await self.aprocess_html( - url=url, - html=html, - extracted_content=extracted_content, - word_count_threshold=word_count_threshold, - extraction_strategy=extraction_strategy, - chunking_strategy=chunking_strategy, - content_filter=content_filter, - css_selector=css_selector, - screenshot=screenshot_data, - verbose=verbose, - is_cached=bool(cached_result), - async_response=async_response, - is_web_url=cache_context.is_web_url, - is_local_file=cache_context.is_local_file, - is_raw_html=cache_context.is_raw_html, - **kwargs, - ) + # Update cache if appropriate + if cache_context.should_write() and not bool(cached_result): + await async_db_manager.acache_url(crawl_result) + + return crawl_result - # Set response data - if 
async_response: - crawl_result.status_code = async_response.status_code - crawl_result.response_headers = async_response.response_headers - crawl_result.downloaded_files = async_response.downloaded_files - else: - crawl_result.status_code = 200 - crawl_result.response_headers = cached_result.response_headers if cached_result else {} - - crawl_result.success = bool(html) - crawl_result.session_id = kwargs.get("session_id", None) - - # if verbose: - # print(f"{Fore.GREEN}{self.tag_format('COMPLETE')} {self.log_icons['COMPLETE']} {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... | Status: {Fore.GREEN if crawl_result.success else Fore.RED}{crawl_result.success} | {Fore.YELLOW}Total: {time.perf_counter() - start_time:.2f}s{Style.RESET_ALL}") - self.logger.success( - message="{url:.50}... | Status: {status} | Total: {timing}", - tag="COMPLETE", - params={ - "url": cache_context.display_url, - "status": crawl_result.success, - "timing": f"{time.perf_counter() - start_time:.2f}s" - }, - colors={ - "status": Fore.GREEN if crawl_result.success else Fore.RED, - "timing": Fore.YELLOW - } + except Exception as e: + if not hasattr(e, "msg"): + e.msg = str(e) + # print(f"{Fore.RED}{self.tag_format('ERROR')} {self.log_icons['ERROR']} Failed to crawl {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... | {e.msg}{Style.RESET_ALL}") + + self.logger.error_status( + url=cache_context.display_url, + error=create_box_message(e.msg, type = "error"), + tag="ERROR" + ) + return CrawlResult( + url=url, + html="", + success=False, + error_message=e.msg ) - # Update cache if appropriate - if cache_context.should_write() and not bool(cached_result): - await async_db_manager.acache_url(crawl_result) - - return crawl_result - - except Exception as e: - if not hasattr(e, "msg"): - e.msg = str(e) - # print(f"{Fore.RED}{self.tag_format('ERROR')} {self.log_icons['ERROR']} Failed to crawl {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... 
| {e.msg}{Style.RESET_ALL}") - - self.logger.error_status( - url=cache_context.display_url, - error=create_box_message(e.msg, type = "error"), - tag="ERROR" - ) - return CrawlResult( - url=url, - html="", - success=False, - error_message=e.msg - ) - async def arun_many( self, urls: List[str], From b02544bc0bf1dac897adec6bb0de730e5b7f3ccd Mon Sep 17 00:00:00 2001 From: UncleCode Date: Tue, 3 Dec 2024 21:28:52 +0800 Subject: [PATCH 114/115] docs: update README and blog for version 0.4.0 release, highlighting new features and improvements --- README.md | 22 +++++++---- docs/md_v2/blog/index.md | 28 ++++++++++++++ docs/md_v2/blog/releases/0.4.0.md | 62 +++++++++++++++++++++++++++++++ mkdocs.yml | 14 ++++--- 4 files changed, 113 insertions(+), 13 deletions(-) create mode 100644 docs/md_v2/blog/index.md create mode 100644 docs/md_v2/blog/releases/0.4.0.md diff --git a/README.md b/README.md index d70af8ad..cbeb4067 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,10 @@ Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant community. It delivers blazing-fast, AI-ready web crawling tailored for LLMs, AI agents, and data pipelines. Open source, flexible, and built for real-time performance, Crawl4AI empowers developers with unmatched speed, precision, and deployment ease. -[✨ Check out latest update v0.3.745](#-recent-updates) + +🎉 **Version 0.4.0 is out!** Introducing our experimental PruningContentFilter - a powerful new algorithm for smarter Markdown generation. Test it out and [share your feedback](https://github.com/unclecode/crawl4ai/issues)! [Read the release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/md_v2/blog/releases/0.4.0.md) + +[✨ Check out latest update v0.4.0](#-recent-updates) ## 🧐 Why Crawl4AI? @@ -623,18 +626,21 @@ async def test_news_crawl(): ## ✨ Recent Updates -- 🚀 **Improved ManagedBrowser Configuration**: Dynamic host and port support for more flexible browser management. 
-- 📝 **Enhanced Markdown Generation**: New generator class for better formatting and customization. -- ⚡ **Fast HTML Formatting**: Significantly optimized HTML formatting in the web crawler. -- 🛠️ **Utility & Sanitization Upgrades**: Improved sanitization and expanded utility functions for streamlined workflows. -- 👥 **Acknowledgments**: Added contributor details and pull request acknowledgments for better transparency. +- 🔬 **PruningContentFilter**: New unsupervised filtering strategy for intelligent content extraction based on text density and relevance scoring. +- 🧵 **Enhanced Thread Safety**: Improved multi-threaded environment handling with better locks and parallel processing support. +- 🤖 **Smart User-Agent Generation**: Advanced user-agent generator with customization options and randomization capabilities. +- 📝 **New Blog Launch**: Stay updated with our detailed release notes and technical deep dives at [crawl4ai.com/blog](https://crawl4ai.com/blog). +- 🧪 **Expanded Test Coverage**: Comprehensive test suite for both PruningContentFilter and BM25ContentFilter with edge case handling. +Read the full details of this release in our [0.4.0 Release Notes](https://github.com/unclecode/crawl4ai/blob/main/docs/md_v2/blog/releases/0.4.0.md). ## 📖 Documentation & Roadmap -For detailed documentation, including installation instructions, advanced features, and API reference, visit our [Documentation Website](https://crawl4ai.com/mkdocs/). +> 🚨 **Documentation Update Alert**: We're undertaking a major documentation overhaul next week to reflect recent updates and improvements. Stay tuned for a more comprehensive and up-to-date guide! -Moreover to check our development plans and upcoming features, check out our [Roadmap](https://github.com/unclecode/crawl4ai/blob/main/ROADMAP.md). +For current documentation, including installation instructions, advanced features, and API reference, visit our [Documentation Website](https://crawl4ai.com/mkdocs/). 
+ +To check our development plans and upcoming features, visit our [Roadmap](https://github.com/unclecode/crawl4ai/blob/main/ROADMAP.md).
    📈 Development TODOs diff --git a/docs/md_v2/blog/index.md b/docs/md_v2/blog/index.md new file mode 100644 index 00000000..054b12f8 --- /dev/null +++ b/docs/md_v2/blog/index.md @@ -0,0 +1,28 @@ +# Crawl4AI Blog + +Welcome to the Crawl4AI blog! Here you'll find detailed release notes, technical deep dives, and news about the project. + +## Latest Release + +### [0.4.0 - Major Content Filtering Update](releases/0.4.0.md) +*December 1, 2024* + +Introducing significant improvements to content filtering, multi-threaded environment handling, and user-agent generation. This release features the new PruningContentFilter, enhanced thread safety, and improved test coverage. + +[Read full release notes →](releases/0.4.0.md) + +## Project History + +Want to see how we got here? Check out our [complete changelog](https://github.com/unclecode/crawl4ai/blob/main/CHANGELOG.md) covering all previous versions and the evolution of Crawl4AI. + +## Categories + +- [Technical Deep Dives](/blog/technical) - Coming soon +- [Tutorials & Guides](/blog/tutorials) - Coming soon +- [Community Updates](/blog/community) - Coming soon + +## Stay Updated + +- Star us on [GitHub](https://github.com/unclecode/crawl4ai) +- Follow [@unclecode](https://twitter.com/unclecode) on Twitter +- Join our community discussions on GitHub diff --git a/docs/md_v2/blog/releases/0.4.0.md b/docs/md_v2/blog/releases/0.4.0.md new file mode 100644 index 00000000..0e7ee5df --- /dev/null +++ b/docs/md_v2/blog/releases/0.4.0.md @@ -0,0 +1,62 @@ +# Release Summary for Version 0.4.0 (December 1, 2024) + +## Overview +The 0.4.0 release introduces significant improvements to content filtering, multi-threaded environment handling, user-agent generation, and test coverage. Key highlights include the introduction of the PruningContentFilter, designed to automatically identify and extract the most valuable parts of an HTML document, as well as enhancements to the BM25ContentFilter to extend its versatility and effectiveness. 
+ +## Major Features and Enhancements + +### 1. PruningContentFilter +- Introduced a new unsupervised content filtering strategy that scores and prunes less relevant nodes in an HTML document based on metrics like text and link density. +- Focuses on retaining the most valuable parts of the content, making it highly effective for extracting relevant information from complex web pages. +- Fully documented with updated README and expanded user guides. + +### 2. User-Agent Generator +- Added a user-agent generator utility that resolves compatibility issues and supports customizable user-agent strings. +- By default, the generator randomizes user agents for each request, adding diversity, but users can customize it for tailored scenarios. + +### 3. Enhanced Thread Safety +- Improved handling of multi-threaded environments by adding better thread locks for parallel processing, ensuring consistency and stability when running multiple threads. + +### 4. Extended Content Filtering Strategies +- Users now have access to both the PruningContentFilter for unsupervised extraction and the BM25ContentFilter for supervised filtering based on user queries. +- Enhanced BM25ContentFilter with improved capabilities to process page titles, meta tags, and descriptions, allowing for more effective classification and clustering of text chunks. + +### 5. Documentation Updates +- Updated examples and tutorials to promote the use of the PruningContentFilter alongside the BM25ContentFilter, providing clear instructions for selecting the appropriate filter for each use case. + +### 6. Unit Test Enhancements +- Added unit tests for PruningContentFilter to ensure accuracy and reliability. +- Enhanced BM25ContentFilter tests to cover additional edge cases and performance metrics, particularly for malformed HTML inputs. 
+ +## Revised Change Logs for Version 0.4.0 + +### PruningContentFilter (Dec 01, 2024) +- Introduced the PruningContentFilter to optimize content extraction by pruning less relevant HTML nodes. + - **Affected Files:** + - **crawl4ai/content_filter_strategy.py**: Added a scoring-based pruning algorithm. + - **README.md**: Updated to include PruningContentFilter usage. + - **docs/md_v2/basic/content_filtering.md**: Expanded user documentation, detailing the use and benefits of PruningContentFilter. + +### Unit Tests for PruningContentFilter (Dec 01, 2024) +- Added comprehensive unit tests for PruningContentFilter to ensure correctness and efficiency. + - **Affected Files:** + - **tests/async/test_content_filter_prune.py**: Created tests covering different pruning scenarios to ensure stability and correctness. + +### Enhanced BM25ContentFilter Tests (Dec 01, 2024) +- Expanded tests to cover additional extraction scenarios and performance metrics, improving robustness. + - **Affected Files:** + - **tests/async/test_content_filter_bm25.py**: Added tests for edge cases, including malformed HTML inputs. + +### Documentation and Example Updates (Dec 01, 2024) +- Revised examples to illustrate the use of PruningContentFilter alongside existing content filtering methods. + - **Affected Files:** + - **docs/examples/quickstart_async.py**: Enhanced example clarity and usability for new users. + +## Experimental Features +- The PruningContentFilter is still under experimental development, and we continue to gather feedback for further refinements. + +## Conclusion +This release significantly enhances the content extraction capabilities of Crawl4ai with the introduction of the PruningContentFilter, improved supervised filtering with BM25ContentFilter, and robust multi-threaded handling. Additionally, the user-agent generator provides much-needed versatility, resolving compatibility issues faced by many users. 
+ +Users are encouraged to experiment with the new content filtering methods to determine which best suits their needs. + diff --git a/mkdocs.yml b/mkdocs.yml index 1b26b9df..4ba7c2a7 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -10,7 +10,11 @@ nav: - 'Installation': 'basic/installation.md' - 'Docker Deplotment': 'basic/docker-deploymeny.md' - 'Quick Start': 'basic/quickstart.md' - + - Changelog & Blog: + - 'Blog Home': 'blog/index.md' + - 'Latest (0.4.0)': 'blog/releases/0.4.0.md' + - 'Changelog': 'https://github.com/unclecode/crawl4ai/blob/main/CHANGELOG.md' + - Basic: - 'Simple Crawling': 'basic/simple-crawling.md' - 'Output Formats': 'basic/output-formats.md' @@ -50,12 +54,12 @@ nav: - '5. Dynamic Content': 'tutorial/episode_05_JavaScript_Execution_and_Dynamic_Content_Handling.md' - '6. Magic Mode': 'tutorial/episode_06_Magic_Mode_and_Anti-Bot_Protection.md' - '7. Content Cleaning': 'tutorial/episode_07_Content_Cleaning_and_Fit_Markdown.md' - - '8. Media Handling': 'tutorial/episode_08_Media_Handling:_Images,_Videos,_and_Audio.md' + - '8. Media Handling': 'tutorial/episode_08_Media_Handling_Images_Videos_and_Audio.md' - '9. Link Analysis': 'tutorial/episode_09_Link_Analysis_and_Smart_Filtering.md' - '10. User Simulation': 'tutorial/episode_10_Custom_Headers,_Identity,_and_User_Simulation.md' - - '11.1. JSON CSS': 'tutorial/episode_11_1_Extraction_Strategies:_JSON_CSS.md' - - '11.2. LLM Strategy': 'tutorial/episode_11_2_Extraction_Strategies:_LLM.md' - - '11.3. Cosine Strategy': 'tutorial/episode_11_3_Extraction_Strategies:_Cosine.md' + - '11.1. JSON CSS': 'tutorial/episode_11_1_Extraction_Strategies_JSON_CSS.md' + - '11.2. LLM Strategy': 'tutorial/episode_11_2_Extraction_Strategies_LLM.md' + - '11.3. Cosine Strategy': 'tutorial/episode_11_3_Extraction_Strategies_Cosine.md' - '12. Session Crawling': 'tutorial/episode_12_Session-Based_Crawling_for_Dynamic_Websites.md' - '13. 
Text Chunking': 'tutorial/episode_13_Chunking_Strategies_for_Large_Text_Processing.md' - '14. Custom Workflows': 'tutorial/episode_14_Hooks_and_Custom_Workflow_with_AsyncWebCrawler.md' From 486db3a7713e6ffb22dc378c989b67bdc57fff74 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Wed, 4 Dec 2024 20:26:39 +0800 Subject: [PATCH 115/115] Updated to version 0.4.0 with new features - Enhanced error handling in async crawler. - Added flexible options in Markdown generation. - Updated user agent settings for improved reliability. - Reflected changes in documentation and examples. --- crawl4ai/__version__.py | 2 +- crawl4ai/async_crawler_strategy.py | 19 ++++++++- crawl4ai/markdown_generation_strategy.py | 12 ++++-- crawl4ai/user_agent_generator.py | 1 + docs/examples/quickstart_async.py | 51 +++++++++++++++++++----- 5 files changed, 69 insertions(+), 16 deletions(-) diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index 189a2955..6f8b06f4 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,2 +1,2 @@ # crawl4ai/_version.py -__version__ = "0.3.747" +__version__ = "0.4.0" diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 3d24bd84..493597ea 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -6,6 +6,7 @@ from typing import Callable, Dict, Any, List, Optional, Awaitable import os, sys, shutil import tempfile, subprocess from playwright.async_api import async_playwright, Page, Browser, Error +from playwright.async_api import TimeoutError as PlaywrightTimeoutError from io import BytesIO from PIL import Image, ImageDraw, ImageFont from pathlib import Path @@ -223,6 +224,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): self.use_cached_html = use_cached_html self.user_agent = kwargs.get( "user_agent", + # "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.5845.187 Safari/604.1 Edg/117.0.2045.47" 
"Mozilla/5.0 (Linux; Android 11; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.120 Mobile Safari/537.36" ) user_agenr_generator = UserAgentGenerator() @@ -941,11 +943,24 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): }); } """ + try: - await page.wait_for_load_state() + try: + await page.wait_for_load_state( + # state="load", + state="domcontentloaded", + timeout=5 + ) + except PlaywrightTimeoutError: + pass await page.evaluate(update_image_dimensions_js) except Exception as e: - raise RuntimeError(f"Error updating image dimensions ACS-UPDATE_IMAGE_DIMENSIONS_JS: {str(e)}") + self.logger.error( + message="Error updating image dimensions ACS-UPDATE_IMAGE_DIMENSIONS_JS: {error}", + tag="ERROR", + params={"error": str(e)} + ) + # raise RuntimeError(f"Error updating image dimensions ACS-UPDATE_IMAGE_DIMENSIONS_JS: {str(e)}") # Wait a bit for any onload events to complete await page.wait_for_timeout(100) diff --git a/crawl4ai/markdown_generation_strategy.py b/crawl4ai/markdown_generation_strategy.py index f242054d..1e0ca664 100644 --- a/crawl4ai/markdown_generation_strategy.py +++ b/crawl4ai/markdown_generation_strategy.py @@ -11,8 +11,9 @@ LINK_PATTERN = re.compile(r'!?\[([^\]]+)\]\(([^)]+?)(?:\s+"([^"]*)")?\)') class MarkdownGenerationStrategy(ABC): """Abstract base class for markdown generation strategies.""" - def __init__(self, content_filter: Optional[RelevantContentFilter] = None): + def __init__(self, content_filter: Optional[RelevantContentFilter] = None, options: Optional[Dict[str, Any]] = None): self.content_filter = content_filter + self.options = options or {} @abstractmethod def generate_markdown(self, @@ -27,8 +28,8 @@ class MarkdownGenerationStrategy(ABC): class DefaultMarkdownGenerator(MarkdownGenerationStrategy): """Default implementation of markdown generation strategy.""" - def __init__(self, content_filter: Optional[RelevantContentFilter] = None): - super().__init__(content_filter) + def __init__(self, 
content_filter: Optional[RelevantContentFilter] = None, options: Optional[Dict[str, Any]] = None): + super().__init__(content_filter, options) def convert_links_to_citations(self, markdown: str, base_url: str = "") -> Tuple[str, str]: link_map = {} @@ -74,6 +75,7 @@ class DefaultMarkdownGenerator(MarkdownGenerationStrategy): cleaned_html: str, base_url: str = "", html2text_options: Optional[Dict[str, Any]] = None, + options: Optional[Dict[str, Any]] = None, content_filter: Optional[RelevantContentFilter] = None, citations: bool = True, **kwargs) -> MarkdownGenerationResult: @@ -82,6 +84,10 @@ class DefaultMarkdownGenerator(MarkdownGenerationStrategy): h = CustomHTML2Text() if html2text_options: h.update_params(**html2text_options) + elif options: + h.update_params(**options) + elif self.options: + h.update_params(**self.options) # Generate raw markdown raw_markdown = h.handle(cleaned_html) diff --git a/crawl4ai/user_agent_generator.py b/crawl4ai/user_agent_generator.py index 0a4df0bb..a1f3a49e 100644 --- a/crawl4ai/user_agent_generator.py +++ b/crawl4ai/user_agent_generator.py @@ -236,6 +236,7 @@ class UserAgentGenerator: # Example usage: if __name__ == "__main__": generator = UserAgentGenerator() + print(generator.generate()) print("\nSingle browser (Chrome):") print(generator.generate(num_browsers=1, browser_type='chrome')) diff --git a/docs/examples/quickstart_async.py b/docs/examples/quickstart_async.py index 73d695c3..176b0ba7 100644 --- a/docs/examples/quickstart_async.py +++ b/docs/examples/quickstart_async.py @@ -547,19 +547,50 @@ async def generate_knowledge_graph(): f.write(result.extracted_content) async def fit_markdown_remove_overlay(): - async with AsyncWebCrawler(headless = False) as crawler: - url = "https://janineintheworld.com/places-to-visit-in-central-mexico" + async with AsyncWebCrawler( + headless=True, # Set to False to see what is happening + verbose=True, + user_agent_mode="random", + user_agent_generator_config={ + "device_type": "mobile", 
+ "os_type": "android" + }, + ) as crawler: result = await crawler.arun( - url=url, + url='https://www.kidocode.com/degrees/technology', cache_mode=CacheMode.BYPASS, - word_count_threshold = 10, - remove_overlay_elements=True, - screenshot = True + markdown_generator=DefaultMarkdownGenerator( + content_filter=PruningContentFilter(threshold=0.48, threshold_type="fixed", min_word_threshold=0), + options={ + "ignore_links": True + } + ), + # markdown_generator=DefaultMarkdownGenerator( + # content_filter=BM25ContentFilter(user_query=None, bm25_threshold=1.0), + # options={ + # "ignore_links": True + # } + # ), ) - # Save markdown to file - with open(os.path.join(__location__, "mexico_places.md"), "w") as f: - f.write(result.fit_markdown) - + + if result.success: + print(len(result.markdown_v2.raw_markdown)) + print(len(result.markdown_v2.markdown_with_citations)) + print(len(result.markdown_v2.fit_markdown)) + + # Save clean html + with open(os.path.join(__location__, "output/cleaned_html.html"), "w") as f: + f.write(result.cleaned_html) + + with open(os.path.join(__location__, "output/output_raw_markdown.md"), "w") as f: + f.write(result.markdown_v2.raw_markdown) + + with open(os.path.join(__location__, "output/output_markdown_with_citations.md"), "w") as f: + f.write(result.markdown_v2.markdown_with_citations) + + with open(os.path.join(__location__, "output/output_fit_markdown.md"), "w") as f: + f.write(result.markdown_v2.fit_markdown) + print("Done")