From d729aa7d5edf9dab069af06e0c4ade1ca997eef7 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sat, 23 Nov 2024 18:00:32 +0800 Subject: [PATCH 01/70] refactor: Add group ID to for images extracted from srcset. --- crawl4ai/content_scraping_strategy.py | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index d4b901d2..70a43240 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -228,24 +228,7 @@ class WebScrapingStrategy(ContentScrapingStrategy): return None def process_image_old(img, url, index, total_images): - def parse_srcset(srcset_str): - """Parse srcset attribute into list of image URLs with their sizes.""" - if not srcset_str: - return [] - - sources = [] - # Split on http/https and filter empty strings - urls = [f"http{part}" for part in srcset_str.split("http") if part] - - for url in urls: - # Remove trailing comma and whitespace, then split to get width - url = url.strip().rstrip(',') - parts = url.rsplit(' ', 1) - img_url = parts[0].strip() - width = parts[1].rstrip('w') if len(parts) > 1 else None - sources.append({'url': img_url, 'width': width}) - - return sources + #Check if an image has valid display and inside undesired html elements def is_valid_image(img, parent, parent_classes): @@ -376,12 +359,16 @@ class WebScrapingStrategy(ContentScrapingStrategy): unique_urls = set() image_variants = [] + # Generate a unique group ID for this set of variants + group_id = index + # Base image info template base_info = { 'alt': alt, 'desc': find_closest_parent_with_useful_text(img), 'score': score, - 'type': 'image' + 'type': 'image', + 'group_id': group_id # Group ID for this set of variants } # Inline function for adding variants From 829a1f7992703064084826e0ebfeed819988c6e7 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sat, 23 Nov 2024 19:45:41 +0800 Subject: [PATCH 02/70] feat: update version 
to 0.3.741 and enhance content filtering with heuristic strategy. Fixing the issue that when the past HTML to BM25 content filter does not have any HTML elements. --- crawl4ai/__version__.py | 2 +- crawl4ai/content_filter_strategy.py | 188 +++++++++++++++++++++++++- crawl4ai/content_scraping_strategy.py | 8 +- 3 files changed, 189 insertions(+), 9 deletions(-) diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index 65ee6e73..05bfd336 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,2 +1,2 @@ # crawl4ai/_version.py -__version__ = "0.3.74" \ No newline at end of file +__version__ = "0.3.741" \ No newline at end of file diff --git a/crawl4ai/content_filter_strategy.py b/crawl4ai/content_filter_strategy.py index 88216f7f..e6891a3f 100644 --- a/crawl4ai/content_filter_strategy.py +++ b/crawl4ai/content_filter_strategy.py @@ -10,6 +10,13 @@ from abc import ABC, abstractmethod from snowballstemmer import stemmer + +# import regex +# def tokenize_text(text): +# # Regular expression to match words or CJK (Chinese, Japanese, Korean) characters +# pattern = r'\p{L}+|\p{N}+|[\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}ー]|[\p{P}]' +# return regex.findall(pattern, text) + # from nltk.stem import PorterStemmer # ps = PorterStemmer() class RelevantContentFilter(ABC): @@ -57,9 +64,14 @@ class RelevantContentFilter(ABC): query_parts = [] # Title - if soup.title: - query_parts.append(soup.title.string) - elif soup.find('h1'): + try: + title = soup.title.string + if title: + query_parts.append(title) + except Exception: + pass + + if soup.find('h1'): query_parts.append(soup.find('h1').get_text()) # Meta tags @@ -81,7 +93,7 @@ class RelevantContentFilter(ABC): return ' '.join(filter(None, query_parts)) - def extract_text_chunks(self, body: Tag) -> List[Tuple[str, str]]: + def extract_text_chunks(self, body: Tag, min_word_threshold: int = None) -> List[Tuple[str, str]]: """ Extracts text chunks from a BeautifulSoup body element while 
preserving order. Returns list of tuples (text, tag_name) for classification. @@ -155,6 +167,9 @@ class RelevantContentFilter(ABC): if text: chunks.append((chunk_index, text, 'content', body)) + if min_word_threshold: + chunks = [chunk for chunk in chunks if len(chunk[1].split()) >= min_word_threshold] + return chunks @@ -274,15 +289,26 @@ class BM25ContentFilter(RelevantContentFilter): } self.stemmer = stemmer(language) - def filter_content(self, html: str) -> List[str]: + def filter_content(self, html: str, min_word_threshold: int = None) -> List[str]: """Implements content filtering using BM25 algorithm with priority tag handling""" if not html or not isinstance(html, str): return [] soup = BeautifulSoup(html, 'lxml') + + # Check if body is present + if not soup.body: + # Wrap in body tag if missing + soup = BeautifulSoup(f'{html}', 'lxml') body = soup.find('body') - query = self.extract_page_query(soup.find('head'), body) - candidates = self.extract_text_chunks(body) + + query = self.extract_page_query(soup, body) + + if not query: + return [] + # return [self.clean_element(soup)] + + candidates = self.extract_text_chunks(body, min_word_threshold) if not candidates: return [] @@ -299,6 +325,10 @@ class BM25ContentFilter(RelevantContentFilter): for _, chunk, _, _ in candidates] tokenized_query = [self.stemmer.stemWord(word) for word in query.lower().split()] + # tokenized_corpus = [[self.stemmer.stemWord(word) for word in tokenize_text(chunk.lower())] + # for _, chunk, _, _ in candidates] + # tokenized_query = [self.stemmer.stemWord(word) for word in tokenize_text(query.lower())] + # Clean from stop words and noise tokenized_corpus = [clean_tokens(tokens) for tokens in tokenized_corpus] tokenized_query = clean_tokens(tokenized_query) @@ -326,3 +356,147 @@ class BM25ContentFilter(RelevantContentFilter): selected_candidates.sort(key=lambda x: x[0]) return [self.clean_element(tag) for _, _, tag in selected_candidates] + + +class 
HeuristicContentFilter(RelevantContentFilter): + def __init__(self): + super().__init__() + # Weights for different heuristics + self.tag_weights = { + 'article': 10, + 'main': 8, + 'section': 5, + 'div': 3, + 'p': 2, + 'pre': 2, + 'code': 2, + 'blockquote': 2, + 'li': 1, + 'span': 1, + } + self.max_depth = 5 # Maximum depth from body to consider + + def filter_content(self, html: str) -> List[str]: + """Implements heuristic content filtering without relying on a query.""" + if not html or not isinstance(html, str): + return [] + + soup = BeautifulSoup(html, 'lxml') + + # Ensure there is a body tag + if not soup.body: + soup = BeautifulSoup(f'{html}', 'lxml') + body = soup.body + + # Extract candidate text chunks + candidates = self.extract_text_chunks(body) + + if not candidates: + return [] + + # Score each candidate + scored_candidates = [] + for index, text, tag_type, tag in candidates: + score = self.score_element(tag, text) + if score > 0: + scored_candidates.append((score, index, text, tag)) + + # Sort candidates by score and then by document order + scored_candidates.sort(key=lambda x: (-x[0], x[1])) + + # Extract the top candidates (e.g., top 5) + top_candidates = scored_candidates[:5] # Adjust the number as needed + + # Sort the top candidates back to their original document order + top_candidates.sort(key=lambda x: x[1]) + + # Clean and return the content + return [self.clean_element(tag) for _, _, _, tag in top_candidates] + + def score_element(self, tag: Tag, text: str) -> float: + """Compute a score for an element based on heuristics.""" + if not text or not tag: + return 0 + + # Exclude unwanted tags + if self.is_excluded(tag): + return 0 + + # Text density + text_length = len(text.strip()) + html_length = len(str(tag)) + text_density = text_length / html_length if html_length > 0 else 0 + + # Link density + link_text_length = sum(len(a.get_text().strip()) for a in tag.find_all('a')) + link_density = link_text_length / text_length if text_length > 0 
else 0 + + # Tag weight + tag_weight = self.tag_weights.get(tag.name, 1) + + # Depth factor (prefer elements closer to the body tag) + depth = self.get_depth(tag) + depth_weight = max(self.max_depth - depth, 1) / self.max_depth + + # Compute the final score + score = (text_density * tag_weight * depth_weight) / (1 + link_density) + + return score + + def get_depth(self, tag: Tag) -> int: + """Compute the depth of the tag from the body tag.""" + depth = 0 + current = tag + while current and current != current.parent and current.name != 'body': + current = current.parent + depth += 1 + return depth + + def extract_text_chunks(self, body: Tag) -> List[Tuple[int, str, str, Tag]]: + """ + Extracts text chunks from the body element while preserving order. + Returns list of tuples (index, text, tag_type, tag) for scoring. + """ + chunks = [] + index = 0 + + def traverse(element): + nonlocal index + if isinstance(element, NavigableString): + return + if not isinstance(element, Tag): + return + if self.is_excluded(element): + return + # Only consider included tags + if element.name in self.included_tags: + text = element.get_text(separator=' ', strip=True) + if len(text.split()) >= self.min_word_count: + tag_type = 'header' if element.name in self.header_tags else 'content' + chunks.append((index, text, tag_type, element)) + index += 1 + # Do not traverse children of this element to prevent duplication + return + for child in element.children: + traverse(child) + + traverse(body) + return chunks + + def is_excluded(self, tag: Tag) -> bool: + """Determine if a tag should be excluded based on heuristics.""" + if tag.name in self.excluded_tags: + return True + class_id = ' '.join(filter(None, [ + ' '.join(tag.get('class', [])), + tag.get('id', '') + ])) + if self.negative_patterns.search(class_id): + return True + # Exclude tags with high link density (e.g., navigation menus) + text = tag.get_text(separator=' ', strip=True) + link_text_length = sum(len(a.get_text(strip=True)) 
for a in tag.find_all('a')) + text_length = len(text) + if text_length > 0 and (link_text_length / text_length) > 0.5: + return True + return False diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index 70a43240..ea6a2ef8 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -9,7 +9,7 @@ from bs4 import element, NavigableString, Comment from urllib.parse import urljoin from requests.exceptions import InvalidSchema # from .content_cleaning_strategy import ContentCleaningStrategy -from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter +from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter#, HeuristicContentFilter from .markdown_generation_strategy import MarkdownGenerationStrategy, DefaultMarkdownGenerationStrategy from .models import MarkdownGenerationResult from .utils import ( @@ -129,6 +129,12 @@ class WebScrapingStrategy(ContentScrapingStrategy): params={"error": str(e)} ) markdown_generator = None + return { + 'markdown': f"Error using new markdown generation strategy: {str(e)}", + 'fit_markdown': "Set flag 'fit_markdown' to True to get cleaned HTML content.", + 'fit_html': "Set flag 'fit_markdown' to True to get cleaned HTML content.", + 'markdown_v2': None + } # Legacy method h = CustomHTML2Text() From edad7b6a742249f324d3baba01095f93fc05912f Mon Sep 17 00:00:00 2001 From: unclecode Date: Sun, 24 Nov 2024 18:48:39 +0800 Subject: [PATCH 03/70] chore: remove Railway deployment configuration and related documentation --- deploy/railway/README.md | 19 - deploy/railway/button.json | 33 -- deploy/railway/railway.toml | 18 - pages/app.css | 131 ----- pages/app.js | 356 ------------ pages/index copy.html | 971 -------------------------------- pages/index.html | 73 --- pages/index_pooling.html | 425 -------------- pages/partial/footer.html | 36 -- pages/partial/how_to_guide.html | 174 ------ pages/partial/installation.html | 65 --- 
pages/partial/try_it.html | 217 ------- pages/tmp.html | 434 -------------- 13 files changed, 2952 deletions(-) delete mode 100644 deploy/railway/README.md delete mode 100644 deploy/railway/button.json delete mode 100644 deploy/railway/railway.toml delete mode 100644 pages/app.css delete mode 100644 pages/app.js delete mode 100644 pages/index copy.html delete mode 100644 pages/index.html delete mode 100644 pages/index_pooling.html delete mode 100644 pages/partial/footer.html delete mode 100644 pages/partial/how_to_guide.html delete mode 100644 pages/partial/installation.html delete mode 100644 pages/partial/try_it.html delete mode 100644 pages/tmp.html diff --git a/deploy/railway/README.md b/deploy/railway/README.md deleted file mode 100644 index 155e7642..00000000 --- a/deploy/railway/README.md +++ /dev/null @@ -1,19 +0,0 @@ -# Railway Deployment - -## Quick Deploy -[![Deploy on Railway](https://railway.app/button.svg)](https://railway.app/template/crawl4ai) - -## Manual Setup -1. Fork this repository -2. Create a new Railway project -3. Configure environment variables: - - `INSTALL_TYPE`: basic or all - - `ENABLE_GPU`: true/false -4. Deploy! 
- -## Configuration -See `railway.toml` for: -- Memory limits -- Health checks -- Restart policies -- Scaling options \ No newline at end of file diff --git a/deploy/railway/button.json b/deploy/railway/button.json deleted file mode 100644 index 1fc52167..00000000 --- a/deploy/railway/button.json +++ /dev/null @@ -1,33 +0,0 @@ -{ - "name": "Crawl4AI", - "description": "LLM Friendly Web Crawler & Scraper", - "render": { - "dockerfile": { - "path": "Dockerfile" - } - }, - "env": [ - { - "key": "INSTALL_TYPE", - "description": "Installation type (basic/all)", - "default": "basic", - "required": true - }, - { - "key": "ENABLE_GPU", - "description": "Enable GPU support", - "default": "false", - "required": false - } - ], - "services": [ - { - "name": "web", - "dockerfile": "./Dockerfile", - "healthcheck": { - "path": "/health", - "port": 11235 - } - } - ] - } \ No newline at end of file diff --git a/deploy/railway/railway.toml b/deploy/railway/railway.toml deleted file mode 100644 index f24d8fab..00000000 --- a/deploy/railway/railway.toml +++ /dev/null @@ -1,18 +0,0 @@ -# railway.toml -[build] -builder = "DOCKERFILE" -dockerfilePath = "Dockerfile" - -[deploy] -startCommand = "uvicorn main:app --host 0.0.0.0 --port $PORT" -healthcheckPath = "/health" -restartPolicyType = "ON_FAILURE" -restartPolicyMaxRetries = 3 - -[deploy.memory] -soft = 2048 # 2GB min for Playwright -hard = 4096 # 4GB max - -[deploy.scaling] -min = 1 -max = 1 diff --git a/pages/app.css b/pages/app.css deleted file mode 100644 index 0e94a2e5..00000000 --- a/pages/app.css +++ /dev/null @@ -1,131 +0,0 @@ -:root { - --ifm-font-size-base: 100%; - --ifm-line-height-base: 1.65; - --ifm-font-family-base: system-ui, -apple-system, Segoe UI, Roboto, Ubuntu, Cantarell, Noto Sans, sans-serif, - BlinkMacSystemFont, "Segoe UI", Helvetica, Arial, sans-serif, "Apple Color Emoji", "Segoe UI Emoji", - "Segoe UI Symbol"; -} -html { - -webkit-font-smoothing: antialiased; - -webkit-text-size-adjust: 100%; - 
text-size-adjust: 100%; - font: var(--ifm-font-size-base) / var(--ifm-line-height-base) var(--ifm-font-family-base); -} -body { - background-color: #1a202c; - color: #fff; -} -.tab-content { - max-height: 400px; - overflow: auto; -} -pre { - white-space: pre-wrap; - font-size: 14px; -} -pre code { - width: 100%; -} - -/* Custom styling for docs-item class and Markdown generated elements */ -.docs-item { - background-color: #2d3748; /* bg-gray-800 */ - padding: 1rem; /* p-4 */ - border-radius: 0.375rem; /* rounded */ - box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1); /* shadow-md */ - margin-bottom: 1rem; /* space between items */ - line-height: 1.5; /* leading-normal */ -} - -.docs-item h3, -.docs-item h4 { - color: #ffffff; /* text-white */ - font-size: 1.25rem; /* text-xl */ - font-weight: 700; /* font-bold */ - margin-bottom: 0.5rem; /* mb-2 */ -} -.docs-item h4 { - font-size: 1rem; /* text-xl */ -} - -.docs-item p { - color: #e2e8f0; /* text-gray-300 */ - margin-bottom: 0.5rem; /* mb-2 */ -} - -.docs-item code { - background-color: #1a202c; /* bg-gray-900 */ - color: #e2e8f0; /* text-gray-300 */ - padding: 0.25rem 0.5rem; /* px-2 py-1 */ - border-radius: 0.25rem; /* rounded */ - font-size: 0.875rem; /* text-sm */ -} - -.docs-item pre { - background-color: #1a202c; /* bg-gray-900 */ - color: #e2e8f0; /* text-gray-300 */ - padding: 0.5rem; /* p-2 */ - border-radius: 0.375rem; /* rounded */ - overflow: auto; /* overflow-auto */ - margin-bottom: 0.5rem; /* mb-2 */ -} - -.docs-item div { - color: #e2e8f0; /* text-gray-300 */ - font-size: 1rem; /* prose prose-sm */ - line-height: 1.25rem; /* line-height for readability */ -} - -/* Adjustments to make prose class more suitable for dark mode */ -.prose { - max-width: none; /* max-w-none */ -} - -.prose p, -.prose ul { - margin-bottom: 1rem; /* mb-4 */ -} - -.prose code { - /* background-color: #4a5568; */ /* bg-gray-700 */ - color: #65a30d; /* text-white */ - padding: 0.25rem 0.5rem; /* px-1 py-0.5 */ - border-radius: 
0.25rem; /* rounded */ - display: inline-block; /* inline-block */ -} - -.prose pre { - background-color: #1a202c; /* bg-gray-900 */ - color: #ffffff; /* text-white */ - padding: 0.5rem; /* p-2 */ - border-radius: 0.375rem; /* rounded */ -} - -.prose h3 { - color: #65a30d; /* text-white */ - font-size: 1.25rem; /* text-xl */ - font-weight: 700; /* font-bold */ - margin-bottom: 0.5rem; /* mb-2 */ -} - -body { - background-color: #1a1a1a; - color: #b3ff00; -} -.sidebar { - color: #b3ff00; - border-right: 1px solid #333; -} -.sidebar a { - color: #b3ff00; - text-decoration: none; -} -.sidebar a:hover { - background-color: #555; -} -.content-section { - display: none; -} -.content-section.active { - display: block; -} diff --git a/pages/app.js b/pages/app.js deleted file mode 100644 index 098008ab..00000000 --- a/pages/app.js +++ /dev/null @@ -1,356 +0,0 @@ -// JavaScript to manage dynamic form changes and logic -document.getElementById("extraction-strategy-select").addEventListener("change", function () { - const strategy = this.value; - const providerModelSelect = document.getElementById("provider-model-select"); - const tokenInput = document.getElementById("token-input"); - const instruction = document.getElementById("instruction"); - const semantic_filter = document.getElementById("semantic_filter"); - const instruction_div = document.getElementById("instruction_div"); - const semantic_filter_div = document.getElementById("semantic_filter_div"); - const llm_settings = document.getElementById("llm_settings"); - - if (strategy === "LLMExtractionStrategy") { - // providerModelSelect.disabled = false; - // tokenInput.disabled = false; - // semantic_filter.disabled = true; - // instruction.disabled = false; - llm_settings.classList.remove("hidden"); - instruction_div.classList.remove("hidden"); - semantic_filter_div.classList.add("hidden"); - } else if (strategy === "NoExtractionStrategy") { - semantic_filter_div.classList.add("hidden"); - 
instruction_div.classList.add("hidden"); - llm_settings.classList.add("hidden"); - } else { - // providerModelSelect.disabled = true; - // tokenInput.disabled = true; - // semantic_filter.disabled = false; - // instruction.disabled = true; - llm_settings.classList.add("hidden"); - instruction_div.classList.add("hidden"); - semantic_filter_div.classList.remove("hidden"); - } - - -}); - -// Get the selected provider model and token from local storage -const storedProviderModel = localStorage.getItem("provider_model"); -const storedToken = localStorage.getItem(storedProviderModel); - -if (storedProviderModel) { - document.getElementById("provider-model-select").value = storedProviderModel; -} - -if (storedToken) { - document.getElementById("token-input").value = storedToken; -} - -// Handle provider model dropdown change -document.getElementById("provider-model-select").addEventListener("change", () => { - const selectedProviderModel = document.getElementById("provider-model-select").value; - const storedToken = localStorage.getItem(selectedProviderModel); - - if (storedToken) { - document.getElementById("token-input").value = storedToken; - } else { - document.getElementById("token-input").value = ""; - } -}); - -// Fetch total count from the database -axios - .get("/total-count") - .then((response) => { - document.getElementById("total-count").textContent = response.data.count; - }) - .catch((error) => console.error(error)); - -// Handle crawl button click -document.getElementById("crawl-btn").addEventListener("click", () => { - // validate input to have both URL and API token - // if selected extraction strategy is LLMExtractionStrategy, then API token is required - if (document.getElementById("extraction-strategy-select").value === "LLMExtractionStrategy") { - if (!document.getElementById("url-input").value || !document.getElementById("token-input").value) { - alert("Please enter both URL(s) and API token."); - return; - } - } - - const selectedProviderModel = 
document.getElementById("provider-model-select").value; - const apiToken = document.getElementById("token-input").value; - const extractBlocks = document.getElementById("extract-blocks-checkbox").checked; - const bypassCache = document.getElementById("bypass-cache-checkbox").checked; - - // Save the selected provider model and token to local storage - localStorage.setItem("provider_model", selectedProviderModel); - localStorage.setItem(selectedProviderModel, apiToken); - - const urlsInput = document.getElementById("url-input").value; - const urls = urlsInput.split(",").map((url) => url.trim()); - const data = { - urls: urls, - include_raw_html: true, - bypass_cache: bypassCache, - extract_blocks: extractBlocks, - word_count_threshold: parseInt(document.getElementById("threshold").value), - extraction_strategy: document.getElementById("extraction-strategy-select").value, - extraction_strategy_args: { - provider: selectedProviderModel, - api_token: apiToken, - instruction: document.getElementById("instruction").value, - semantic_filter: document.getElementById("semantic_filter").value, - }, - chunking_strategy: document.getElementById("chunking-strategy-select").value, - chunking_strategy_args: {}, - css_selector: document.getElementById("css-selector").value, - screenshot: document.getElementById("screenshot-checkbox").checked, - // instruction: document.getElementById("instruction").value, - // semantic_filter: document.getElementById("semantic_filter").value, - verbose: true, - }; - - // import requests - - // data = { - // "urls": [ - // "https://www.nbcnews.com/business" - // ], - // "word_count_threshold": 10, - // "extraction_strategy": "NoExtractionStrategy", - // } - - // response = requests.post("https://crawl4ai.com/crawl", json=data) # OR local host if your run locally - // print(response.json()) - - // save api token to local storage - localStorage.setItem("api_token", document.getElementById("token-input").value); - - 
document.getElementById("loading").classList.remove("hidden"); - document.getElementById("result").style.visibility = "hidden"; - document.getElementById("code_help").style.visibility = "hidden"; - - axios - .post("/crawl", data) - .then((response) => { - const result = response.data.results[0]; - const parsedJson = JSON.parse(result.extracted_content); - document.getElementById("json-result").textContent = JSON.stringify(parsedJson, null, 2); - document.getElementById("cleaned-html-result").textContent = result.cleaned_html; - document.getElementById("markdown-result").textContent = result.markdown; - document.getElementById("media-result").textContent = JSON.stringify( result.media, null, 2); - if (result.screenshot){ - const imgElement = document.createElement("img"); - // Set the src attribute with the base64 data - imgElement.src = `data:image/png;base64,${result.screenshot}`; - document.getElementById("screenshot-result").innerHTML = ""; - document.getElementById("screenshot-result").appendChild(imgElement); - } - - // Update code examples dynamically - const extractionStrategy = data.extraction_strategy; - const isLLMExtraction = extractionStrategy === "LLMExtractionStrategy"; - - // REMOVE API TOKEN FROM CODE EXAMPLES - data.extraction_strategy_args.api_token = "your_api_token"; - - if (data.extraction_strategy === "NoExtractionStrategy") { - delete data.extraction_strategy_args; - delete data.extrac_blocks; - } - - if (data.chunking_strategy === "RegexChunking") { - delete data.chunking_strategy_args; - } - - delete data.verbose; - - if (data.css_selector === "") { - delete data.css_selector; - } - - if (!data.bypass_cache) { - delete data.bypass_cache; - } - - if (!data.extract_blocks) { - delete data.extract_blocks; - } - - if (!data.include_raw_html) { - delete data.include_raw_html; - } - - document.getElementById( - "curl-code" - ).textContent = `curl -X POST -H "Content-Type: application/json" -d '${JSON.stringify({ - ...data, - api_token: 
isLLMExtraction ? "your_api_token" : undefined, - }, null, 2)}' https://crawl4ai.com/crawl`; - - document.getElementById("python-code").textContent = `import requests\n\ndata = ${JSON.stringify( - { ...data, api_token: isLLMExtraction ? "your_api_token" : undefined }, - null, - 2 - )}\n\nresponse = requests.post("https://crawl4ai.com/crawl", json=data) # OR local host if your run locally \nprint(response.json())`; - - document.getElementById( - "nodejs-code" - ).textContent = `const axios = require('axios');\n\nconst data = ${JSON.stringify( - { ...data, api_token: isLLMExtraction ? "your_api_token" : undefined }, - null, - 2 - )};\n\naxios.post("https://crawl4ai.com/crawl", data) // OR local host if your run locally \n .then(response => console.log(response.data))\n .catch(error => console.error(error));`; - - document.getElementById( - "library-code" - ).textContent = `from crawl4ai.web_crawler import WebCrawler\nfrom crawl4ai.extraction_strategy import *\nfrom crawl4ai.chunking_strategy import *\n\ncrawler = WebCrawler()\ncrawler.warmup()\n\nresult = crawler.run(\n url='${ - urls[0] - }',\n word_count_threshold=${data.word_count_threshold},\n extraction_strategy=${ - isLLMExtraction - ? 
`${extractionStrategy}(provider="${data.provider_model}", api_token="${data.api_token}")` - : extractionStrategy + "()" - },\n chunking_strategy=${data.chunking_strategy}(),\n bypass_cache=${ - data.bypass_cache - },\n css_selector="${data.css_selector}"\n)\nprint(result)`; - - // Highlight code syntax - hljs.highlightAll(); - - // Select JSON tab by default - document.querySelector('.tab-btn[data-tab="json"]').click(); - - document.getElementById("loading").classList.add("hidden"); - - document.getElementById("result").style.visibility = "visible"; - document.getElementById("code_help").style.visibility = "visible"; - - // increment the total count - document.getElementById("total-count").textContent = - parseInt(document.getElementById("total-count").textContent) + 1; - }) - .catch((error) => { - console.error(error); - document.getElementById("loading").classList.add("hidden"); - }); -}); - -// Handle tab clicks -document.querySelectorAll(".tab-btn").forEach((btn) => { - btn.addEventListener("click", () => { - const tab = btn.dataset.tab; - document.querySelectorAll(".tab-btn").forEach((b) => b.classList.remove("bg-lime-700", "text-white")); - btn.classList.add("bg-lime-700", "text-white"); - document.querySelectorAll(".tab-content.code pre").forEach((el) => el.classList.add("hidden")); - document.getElementById(`${tab}-result`).parentElement.classList.remove("hidden"); - }); -}); - -// Handle code tab clicks -document.querySelectorAll(".code-tab-btn").forEach((btn) => { - btn.addEventListener("click", () => { - const tab = btn.dataset.tab; - document.querySelectorAll(".code-tab-btn").forEach((b) => b.classList.remove("bg-lime-700", "text-white")); - btn.classList.add("bg-lime-700", "text-white"); - document.querySelectorAll(".tab-content.result pre").forEach((el) => el.classList.add("hidden")); - document.getElementById(`${tab}-code`).parentElement.classList.remove("hidden"); - }); -}); - -// Handle copy to clipboard button clicks - -async function 
copyToClipboard(text) { - if (navigator.clipboard && navigator.clipboard.writeText) { - return navigator.clipboard.writeText(text); - } else { - return fallbackCopyTextToClipboard(text); - } -} - -function fallbackCopyTextToClipboard(text) { - return new Promise((resolve, reject) => { - const textArea = document.createElement("textarea"); - textArea.value = text; - - // Avoid scrolling to bottom - textArea.style.top = "0"; - textArea.style.left = "0"; - textArea.style.position = "fixed"; - - document.body.appendChild(textArea); - textArea.focus(); - textArea.select(); - - try { - const successful = document.execCommand("copy"); - if (successful) { - resolve(); - } else { - reject(); - } - } catch (err) { - reject(err); - } - - document.body.removeChild(textArea); - }); -} - -document.querySelectorAll(".copy-btn").forEach((btn) => { - btn.addEventListener("click", () => { - const target = btn.dataset.target; - const code = document.getElementById(target).textContent; - //navigator.clipboard.writeText(code).then(() => { - copyToClipboard(code).then(() => { - btn.textContent = "Copied!"; - setTimeout(() => { - btn.textContent = "Copy"; - }, 2000); - }); - }); -}); - -document.addEventListener("DOMContentLoaded", async () => { - try { - const extractionResponse = await fetch("/strategies/extraction"); - const extractionStrategies = await extractionResponse.json(); - - const chunkingResponse = await fetch("/strategies/chunking"); - const chunkingStrategies = await chunkingResponse.json(); - - renderStrategies("extraction-strategies", extractionStrategies); - renderStrategies("chunking-strategies", chunkingStrategies); - } catch (error) { - console.error("Error fetching strategies:", error); - } -}); - -function renderStrategies(containerId, strategies) { - const container = document.getElementById(containerId); - container.innerHTML = ""; // Clear any existing content - strategies = JSON.parse(strategies); - Object.entries(strategies).forEach(([strategy, description]) 
=> { - const strategyElement = document.createElement("div"); - strategyElement.classList.add("bg-zinc-800", "p-4", "rounded", "shadow-md", "docs-item"); - - const strategyDescription = document.createElement("div"); - strategyDescription.classList.add("text-gray-300", "prose", "prose-sm"); - strategyDescription.innerHTML = marked.parse(description); - - strategyElement.appendChild(strategyDescription); - - container.appendChild(strategyElement); - }); -} -document.querySelectorAll(".sidebar a").forEach((link) => { - link.addEventListener("click", function (event) { - event.preventDefault(); - document.querySelectorAll(".content-section").forEach((section) => { - section.classList.remove("active"); - }); - const target = event.target.getAttribute("data-target"); - document.getElementById(target).classList.add("active"); - }); -}); -// Highlight code syntax -hljs.highlightAll(); diff --git a/pages/index copy.html b/pages/index copy.html deleted file mode 100644 index b61b7298..00000000 --- a/pages/index copy.html +++ /dev/null @@ -1,971 +0,0 @@ - - - - - - Crawl4AI - - - - - - - - - - - - - - - - -
-
-

🔥🕷️ Crawl4AI: Web Data for your Thoughts

-
-
- 📊 Total Website Processed - 2 -
-
- -
-
-

Try It Now

-
-
-
- - -
-
- - -
-
- - -
-
- - -
-
- - -
-
- - -
-
- - -
-
-
- - -
-
- - -
- -
-
- -
- -
- - - -
-
-
- - -
-
- -
-
- - - - -
-
-
-                                
-                                
-                            
- - - -
-
-
-
-
-
-
- -
- 🌟 Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! -
-
- First Step: Create an instance of WebCrawler and call the warmup() function. -
-
-
crawler = WebCrawler()
-            crawler.warmup()
-
- - -
- 🧠 Understanding 'bypass_cache' and 'include_raw_html' parameters: -
-
First crawl (caches the result):
-
-
result = crawler.run(url="https://www.nbcnews.com/business")
-
-
Second crawl (Force to crawl again):
-
-
result = crawler.run(url="https://www.nbcnews.com/business", bypass_cache=True)
-
-
Crawl result without raw HTML content:
-
-
result = crawler.run(url="https://www.nbcnews.com/business", include_raw_html=False)
-
- - -
- 📄 - The 'include_raw_html' parameter, when set to True, includes the raw HTML content in the - response. By default, it is set to True. -
-
Set always_by_pass_cache to True:
-
-
crawler.always_by_pass_cache = True
-
- - -
- 🧩 Let's add a chunking strategy: RegexChunking! -
-
Using RegexChunking:
-
-
result = crawler.run(
-                url="https://www.nbcnews.com/business",
-                chunking_strategy=RegexChunking(patterns=["\n\n"])
-            )
-
-
Using NlpSentenceChunking:
-
-
result = crawler.run(
-                url="https://www.nbcnews.com/business",
-                chunking_strategy=NlpSentenceChunking()
-            )
-
- - -
- 🧠 Let's get smarter with an extraction strategy: CosineStrategy! -
-
Using CosineStrategy:
-
-
result = crawler.run(
-                url="https://www.nbcnews.com/business",
-                extraction_strategy=CosineStrategy(word_count_threshold=10, max_dist=0.2, linkage_method="ward", top_k=3)
-            )
-
- - -
- 🤖 Time to bring in the big guns: LLMExtractionStrategy without instructions! -
-
Using LLMExtractionStrategy without instructions:
-
-
result = crawler.run(
-                url="https://www.nbcnews.com/business",
-                extraction_strategy=LLMExtractionStrategy(provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY'))
-            )
-
- - -
- 📜 Let's make it even more interesting: LLMExtractionStrategy with instructions! -
-
Using LLMExtractionStrategy with instructions:
-
-
result = crawler.run(
-                url="https://www.nbcnews.com/business",
-                extraction_strategy=LLMExtractionStrategy(
-                    provider="openai/gpt-4o",
-                    api_token=os.getenv('OPENAI_API_KEY'),
-                    instruction="I am interested in only financial news"
-                )
-            )
-
- - -
- 🎯 Targeted extraction: Let's use a CSS selector to extract only H2 tags! -
-
Using CSS selector to extract H2 tags:
-
-
result = crawler.run(
-                url="https://www.nbcnews.com/business",
-                css_selector="h2"
-            )
-
- - -
- 🖱️ Let's get interactive: Passing JavaScript code to click 'Load More' button! -
-
Using JavaScript to click 'Load More' button:
-
-
js_code = """
-            const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
-            loadMoreButton && loadMoreButton.click();
-            """
-            crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
-            crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
-            result = crawler.run(url="https://www.nbcnews.com/business")
-
- - -
- 🎉 - Congratulations! You've made it through the Crawl4ai Quickstart Guide! Now go forth and crawl - the web like a pro! 🕸️ -
-
-
-
-

Installation 💻

-

- There are two ways to use Crawl4AI: as a library in your Python projects or as a standalone local - server. -

- -

- You can also try Crawl4AI in a Google Colab - Open In Colab -

- -

Using Crawl4AI as a Library 📚

-

To install Crawl4AI as a library, follow these steps:

- -
    -
  1. - Install the package from GitHub: -
    pip install git+https://github.com/unclecode/crawl4ai.git
    -
  2. -
  3. - Alternatively, you can clone the repository and install the package locally: -
    virtualenv venv
    -source venv/bin/activate
    -git clone https://github.com/unclecode/crawl4ai.git
    -cd crawl4ai
    -pip install -e .
    -        
    -
  4. -
  5. - Import the necessary modules in your Python script: -
    from crawl4ai.web_crawler import WebCrawler
    -from crawl4ai.chunking_strategy import *
    -from crawl4ai.extraction_strategy import *
    -import os
    -
    -crawler = WebCrawler()
    -
    -# Single page crawl
    -single_url = UrlModel(url='https://www.nbcnews.com/business', forced=False)
    -result = crawl4ai.fetch_page(
    -    url='https://www.nbcnews.com/business',
    -    word_count_threshold=5, # Minimum word count for a HTML tag to be considered as a worthy block
    -    chunking_strategy= RegexChunking( patterns = ["\\n\\n"]), # Default is RegexChunking
    -    extraction_strategy= CosineStrategy(word_count_threshold=10, max_dist=0.2, linkage_method='ward', top_k=3) # Default is CosineStrategy
    -    # extraction_strategy= LLMExtractionStrategy(provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY')),
    -    bypass_cache=False,
    -    extract_blocks =True, # Whether to extract semantical blocks of text from the HTML
    -    css_selector = "", # Eg: "div.article-body"
    -    verbose=True,
    -    include_raw_html=True, # Whether to include the raw HTML content in the response
    -)
    -print(result.model_dump())
    -        
    -
  6. -
-

- For more information about how to run Crawl4AI as a local server, please refer to the - GitHub repository. -

- -
- -
-

📖 Parameters

-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ParameterDescriptionRequiredDefault Value
urls - A list of URLs to crawl and extract data from. - Yes-
include_raw_html - Whether to include the raw HTML content in the response. - Nofalse
bypass_cache - Whether to force a fresh crawl even if the URL has been previously crawled. - Nofalse
extract_blocks - Whether to extract semantical blocks of text from the HTML. - Notrue
word_count_threshold - The minimum number of words a block must contain to be considered meaningful (minimum - value is 5). - No5
extraction_strategy - The strategy to use for extracting content from the HTML (e.g., "CosineStrategy"). - NoCosineStrategy
chunking_strategy - The strategy to use for chunking the text before processing (e.g., "RegexChunking"). - NoRegexChunking
css_selector - The CSS selector to target specific parts of the HTML for extraction. - NoNone
verboseWhether to enable verbose logging.Notrue
-
-
- -
-
-

Extraction Strategies

-
-
-
- -
-
-

Chunking Strategies

-
-
-
- -
-
-

🤔 Why building this?

-

- In recent times, we've witnessed a surge of startups emerging, riding the AI hype wave and charging - for services that should rightfully be accessible to everyone. 🌍💸 One such example is scraping and - crawling web pages and transforming them into a format suitable for Large Language Models (LLMs). - 🕸️🤖 We believe that building a business around this is not the right approach; instead, it should - definitely be open-source. 🆓🌟 So, if you possess the skills to build such tools and share our - philosophy, we invite you to join our "Robinhood" band and help set these products free for the - benefit of all. 🤝💪 -

-
-
- -
-
-

⚙️ Installation

-

- To install and run Crawl4AI as a library or a local server, please refer to the 📚 - GitHub repository. -

-
-
- - - - - - diff --git a/pages/index.html b/pages/index.html deleted file mode 100644 index 2947c34a..00000000 --- a/pages/index.html +++ /dev/null @@ -1,73 +0,0 @@ - - - - - - Crawl4AI - - - - - - - - - - - - - - - -
- -
-

🔥🕷️ Crawl4AI: Web Data for your Thoughts

-
-
- 📊 Total Website Processed - 2 -
-
- - {{ try_it | safe }} - -
-
-
- - - -
- {{installation | safe}} {{how_to_guide | safe}} - -
-

Chunking Strategies

-

Content for chunking strategies...

-
-
-

Extraction Strategies

-

Content for extraction strategies...

-
-
-
-
-
- - {{ footer | safe }} - - - diff --git a/pages/index_pooling.html b/pages/index_pooling.html deleted file mode 100644 index 02128f84..00000000 --- a/pages/index_pooling.html +++ /dev/null @@ -1,425 +0,0 @@ - - - - - - Crawl4AI - - - - - - - - - - - - -
-
-

🔥🕷️ Crawl4AI: Open-source LLM Friendly Web scraper

-
-
- -
-
-

Try It Now

-
- - - -
- - -
- -
-
- -
-
- - - -
-
-
- - -
-
-
-
- - - -
-
-
-                                    
-                                    
-                                
- - -
-
-
-
-
- -
-
-

🤔 Why building this?

-

- In recent times, we've seen numerous startups emerging, riding the AI hype wave and charging for - services that should rightfully be accessible to everyone. 🌍💸 One for example is to scrap and crawl - a web page, and transform it o a form suitable for LLM. We don't think one should build a business - out of this, but definilty should be opened source. So if you possess the skills to build such things - and you have such philosphy you should join our "Robinhood" band and help set - these products free. 🆓🤝 -

-
-
- -
-
-

⚙️ Installation

-

- To install and run Crawl4AI locally or on your own service, the best way is to use Docker. 🐳 Follow - these steps: -

-
    -
  1. - Clone the GitHub repository: 📥 - git clone https://github.com/unclecode/crawl4ai.git -
  2. -
  3. Navigate to the project directory: 📂 cd crawl4ai
  4. -
  5. - Build the Docker image: 🛠️ docker build -t crawl4ai . On Mac, follow: 🍎 - docker build --platform linux/amd64 -t crawl4ai . -
  6. -
  7. Run the Docker container: ▶️ docker run -p 8000:80 crawl4ai
  8. -
-

- For more detailed instructions and advanced configuration options, please refer to the 📚 - GitHub repository. -

-
-
- - - - - - diff --git a/pages/partial/footer.html b/pages/partial/footer.html deleted file mode 100644 index 3ab189e1..00000000 --- a/pages/partial/footer.html +++ /dev/null @@ -1,36 +0,0 @@ -
-
-

🤔 Why building this?

-

- In recent times, we've witnessed a surge of startups emerging, riding the AI hype wave and charging - for services that should rightfully be accessible to everyone. 🌍💸 One such example is scraping and - crawling web pages and transforming them into a format suitable for Large Language Models (LLMs). - 🕸️🤖 We believe that building a business around this is not the right approach; instead, it should - definitely be open-source. 🆓🌟 So, if you possess the skills to build such tools and share our - philosophy, we invite you to join our "Robinhood" band and help set these products free for the - benefit of all. 🤝💪 -

-
-
- - \ No newline at end of file diff --git a/pages/partial/how_to_guide.html b/pages/partial/how_to_guide.html deleted file mode 100644 index 785915c1..00000000 --- a/pages/partial/how_to_guide.html +++ /dev/null @@ -1,174 +0,0 @@ -
-

How to Guide

-
- -
- 🌟 - Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling - fun! -
-
- First Step: Create an instance of WebCrawler and call the - warmup() function. -
-
-
crawler = WebCrawler()
-crawler.warmup()
-
- - -
- 🧠 Understanding 'bypass_cache' and 'include_raw_html' parameters: -
-
First crawl (caches the result):
-
-
result = crawler.run(url="https://www.nbcnews.com/business")
-
-
Second crawl (Force to crawl again):
-
-
result = crawler.run(url="https://www.nbcnews.com/business", bypass_cache=True)
-
- ⚠️ Don't forget to set `bypass_cache` to True if you want to try different strategies for the same URL. Otherwise, the cached result will be returned. You can also set `always_by_pass_cache` in constructor to True to always bypass the cache. -
-
-
Crawl result without raw HTML content:
-
-
result = crawler.run(url="https://www.nbcnews.com/business", include_raw_html=False)
-
- - -
- 📄 - The 'include_raw_html' parameter, when set to True, includes the raw HTML content - in the response. By default, it is set to True. -
-
Set always_by_pass_cache to True:
-
-
crawler.always_by_pass_cache = True
-
- -
- 📸 - Let's take a screenshot of the page! -
-
-
result = crawler.run(
-    url="https://www.nbcnews.com/business",
-    screenshot=True
-)
-with open("screenshot.png", "wb") as f:
-    f.write(base64.b64decode(result.screenshot))
-
- - - -
- 🧩 Let's add a chunking strategy: RegexChunking! -
-
Using RegexChunking:
-
-
result = crawler.run(
-    url="https://www.nbcnews.com/business",
-    chunking_strategy=RegexChunking(patterns=["\n\n"])
-)
-
-
Using NlpSentenceChunking:
-
-
result = crawler.run(
-    url="https://www.nbcnews.com/business",
-    chunking_strategy=NlpSentenceChunking()
-)
-
- - -
- 🧠 Let's get smarter with an extraction strategy: CosineStrategy! -
-
Using CosineStrategy:
-
-
result = crawler.run(
-    url="https://www.nbcnews.com/business",
-    extraction_strategy=CosineStrategy(word_count_threshold=10, max_dist=0.2, linkage_method="ward", top_k=3)
-)
-
- - -
- 🤖 - Time to bring in the big guns: LLMExtractionStrategy without instructions! -
-
Using LLMExtractionStrategy without instructions:
-
-
result = crawler.run(
-    url="https://www.nbcnews.com/business",
-    extraction_strategy=LLMExtractionStrategy(provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY'))
-)
-
- - -
- 📜 - Let's make it even more interesting: LLMExtractionStrategy with - instructions! -
-
Using LLMExtractionStrategy with instructions:
-
-
result = crawler.run(
-    url="https://www.nbcnews.com/business",
-    extraction_strategy=LLMExtractionStrategy(
-    provider="openai/gpt-4o",
-    api_token=os.getenv('OPENAI_API_KEY'),
-    instruction="I am interested in only financial news"
-)
-)
-
- - -
- 🎯 - Targeted extraction: Let's use a CSS selector to extract only H2 tags! -
-
Using CSS selector to extract H2 tags:
-
-
result = crawler.run(
-    url="https://www.nbcnews.com/business",
-    css_selector="h2"
-)
-
- - -
- 🖱️ - Let's get interactive: Passing JavaScript code to click 'Load More' button! -
-
Using JavaScript to click 'Load More' button:
-
-
js_code = ["""
-const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
-loadMoreButton && loadMoreButton.click();
-"""]
-crawler = WebCrawler(verbos=crawler_strategy, always_by_pass_cache=True)
-result = crawler.run(url="https://www.nbcnews.com/business", js = js_code)
-
Remember that you can pass multiple JavaScript code snippets in the list. They all will be executed in the order they are passed.
-
- - -
- 🎉 - Congratulations! You've made it through the Crawl4ai Quickstart Guide! Now go forth - and crawl the web like a pro! 🕸️ -
-
-
\ No newline at end of file diff --git a/pages/partial/installation.html b/pages/partial/installation.html deleted file mode 100644 index 6a6561cd..00000000 --- a/pages/partial/installation.html +++ /dev/null @@ -1,65 +0,0 @@ -
-

Installation 💻

-

- There are three ways to use Crawl4AI: -

    -
  1. - As a library -
  2. -
  3. - As a local server (Docker) -
  4. -
  5. - As a Google Colab notebook. Open In Colab -
  6. -

    - - -

    To install Crawl4AI as a library, follow these steps:

    - -
      -
    1. - Install the package from GitHub: -
      virtualenv venv
      -source venv/bin/activate
      -pip install "crawl4ai[all] @ git+https://github.com/unclecode/crawl4ai.git"
      -            
      -
    2. -
    3. - Run the following command to load the required models. This is optional, but it will boost the performance and speed of the crawler. You need to do this only once. -
      crawl4ai-download-models
      -
    4. -
    5. - Alternatively, you can clone the repository and install the package locally: -
      virtualenv venv
      -source venv/bin/activate
      -git clone https://github.com/unclecode/crawl4ai.git
      -cd crawl4ai
      -pip install -e .[all]
      -
      -
    6. -
    7. - Use docker to run the local server: -
      docker build -t crawl4ai . 
      -# docker build --platform linux/amd64 -t crawl4ai . For Mac users
      -docker run -d -p 8000:80 crawl4ai
      -
    8. -
    -

    - For more information about how to run Crawl4AI as a local server, please refer to the - GitHub repository. -

    -
\ No newline at end of file diff --git a/pages/partial/try_it.html b/pages/partial/try_it.html deleted file mode 100644 index e3033eec..00000000 --- a/pages/partial/try_it.html +++ /dev/null @@ -1,217 +0,0 @@ -
-
-

Try It Now

-
-
-
- - -
-
-
- - -
-
- - -
-
-
-
- - -
-
- - -
-
- -
- - - -
-
-
- - -
-
- - -
- - -
-
- - -
-
- - - - - -
-
-
- - - - -
-
- -
-
- - - - -
-
-
-                        
-                        
-                    
- - - -
-
-
-
-
diff --git a/pages/tmp.html b/pages/tmp.html deleted file mode 100644 index 7c924676..00000000 --- a/pages/tmp.html +++ /dev/null @@ -1,434 +0,0 @@ -
-
-

Installation 💻

-

There are three ways to use Crawl4AI:

-
    -
  1. As a library
  2. -
  3. As a local server (Docker)
  4. -
  5. - As a Google Colab notebook. - Open In Colab -
  6. -

    - -

    To install Crawl4AI as a library, follow these steps:

    - -
      -
    1. - Install the package from GitHub: -
      pip install git+https://github.com/unclecode/crawl4ai.git
      -
    2. -
    3. - Alternatively, you can clone the repository and install the package locally: -
      virtualenv venv
      -source venv/bin/activate
      -git clone https://github.com/unclecode/crawl4ai.git
      -cd crawl4ai
      -pip install -e .
      -
      -
    4. -
    5. - Use docker to run the local server: -
      docker build -t crawl4ai . 
      -# docker build --platform linux/amd64 -t crawl4ai . For Mac users
      -docker run -d -p 8000:80 crawl4ai
      -
    6. -
    -

    - For more information about how to run Crawl4AI as a local server, please refer to the - GitHub repository. -

    -
-
-
-

How to Guide

-
- -
- 🌟 - Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! -
-
- First Step: Create an instance of WebCrawler and call the - warmup() function. -
-
-
crawler = WebCrawler()
-crawler.warmup()
-
- - -
- 🧠 Understanding 'bypass_cache' and 'include_raw_html' parameters: -
-
First crawl (caches the result):
-
-
result = crawler.run(url="https://www.nbcnews.com/business")
-
-
Second crawl (Force to crawl again):
-
-
result = crawler.run(url="https://www.nbcnews.com/business", bypass_cache=True)
-
- ⚠️ Don't forget to set `bypass_cache` to True if you want to try different strategies - for the same URL. Otherwise, the cached result will be returned. You can also set - `always_by_pass_cache` in constructor to True to always bypass the cache. -
-
-
Crawl result without raw HTML content:
-
-
result = crawler.run(url="https://www.nbcnews.com/business", include_raw_html=False)
-
- - -
- 📄 - The 'include_raw_html' parameter, when set to True, includes the raw HTML content in the response. - By default, it is set to True. -
-
Set always_by_pass_cache to True:
-
-
crawler.always_by_pass_cache = True
-
- - -
- 🧩 Let's add a chunking strategy: RegexChunking! -
-
Using RegexChunking:
-
-
result = crawler.run(
-url="https://www.nbcnews.com/business",
-chunking_strategy=RegexChunking(patterns=["\n\n"])
-)
-
-
Using NlpSentenceChunking:
-
-
result = crawler.run(
-url="https://www.nbcnews.com/business",
-chunking_strategy=NlpSentenceChunking()
-)
-
- - -
- 🧠 Let's get smarter with an extraction strategy: CosineStrategy! -
-
Using CosineStrategy:
-
-
result = crawler.run(
-url="https://www.nbcnews.com/business",
-extraction_strategy=CosineStrategy(word_count_threshold=20, max_dist=0.2, linkage_method="ward", top_k=3)
-)
-
- - -
- 🤖 - Time to bring in the big guns: LLMExtractionStrategy without instructions! -
-
Using LLMExtractionStrategy without instructions:
-
-
result = crawler.run(
-url="https://www.nbcnews.com/business",
-extraction_strategy=LLMExtractionStrategy(provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY'))
-)
-
- - -
- 📜 - Let's make it even more interesting: LLMExtractionStrategy with instructions! -
-
Using LLMExtractionStrategy with instructions:
-
-
result = crawler.run(
-url="https://www.nbcnews.com/business",
-extraction_strategy=LLMExtractionStrategy(
-provider="openai/gpt-4o",
-api_token=os.getenv('OPENAI_API_KEY'),
-instruction="I am interested in only financial news"
-)
-)
-
- - -
- 🎯 - Targeted extraction: Let's use a CSS selector to extract only H2 tags! -
-
Using CSS selector to extract H2 tags:
-
-
result = crawler.run(
-url="https://www.nbcnews.com/business",
-css_selector="h2"
-)
-
- - -
- 🖱️ - Let's get interactive: Passing JavaScript code to click 'Load More' button! -
-
Using JavaScript to click 'Load More' button:
-
-
js_code = """
-const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
-loadMoreButton && loadMoreButton.click();
-"""
-crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
-crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
-result = crawler.run(url="https://www.nbcnews.com/business")
-
- - -
- 🎉 - Congratulations! You've made it through the Crawl4ai Quickstart Guide! Now go forth and crawl the - web like a pro! 🕸️ -
-
-
- -
-
-
-

RegexChunking

-

- RegexChunking is a text chunking strategy that splits a given text into smaller parts - using regular expressions. This is useful for preparing large texts for processing by language - models, ensuring they are divided into manageable segments. -

-

Constructor Parameters:

-
    -
  • - patterns (list, optional): A list of regular expression patterns used to split the - text. Default is to split by double newlines (['\n\n']). -
  • -
-

Example usage:

-
chunker = RegexChunking(patterns=[r'\n\n', r'\. '])
-chunks = chunker.chunk("This is a sample text. It will be split into chunks.")
-
-
-
-
-
-

NlpSentenceChunking

-

- NlpSentenceChunking uses a natural language processing model to chunk a given text into - sentences. This approach leverages SpaCy to accurately split text based on sentence boundaries. -

-

Constructor Parameters:

-
    -
  • - None. -
  • -
-

Example usage:

-
chunker = NlpSentenceChunking()
-chunks = chunker.chunk("This is a sample text. It will be split into sentences.")
-
-
-
-
-
-

TopicSegmentationChunking

-

- TopicSegmentationChunking uses the TextTiling algorithm to segment a given text into - topic-based chunks. This method identifies thematic boundaries in the text. -

-

Constructor Parameters:

-
    -
  • - num_keywords (int, optional): The number of keywords to extract for each topic - segment. Default is 3. -
  • -
-

Example usage:

-
chunker = TopicSegmentationChunking(num_keywords=3)
-chunks = chunker.chunk("This is a sample text. It will be split into topic-based segments.")
-
-
-
-
-
-

FixedLengthWordChunking

-

- FixedLengthWordChunking splits a given text into chunks of fixed length, based on the - number of words. -

-

Constructor Parameters:

-
    -
  • - chunk_size (int, optional): The number of words in each chunk. Default is - 100. -
  • -
-

Example usage:

-
chunker = FixedLengthWordChunking(chunk_size=100)
-chunks = chunker.chunk("This is a sample text. It will be split into fixed-length word chunks.")
-
-
-
-
-
-

SlidingWindowChunking

-

- SlidingWindowChunking uses a sliding window approach to chunk a given text. Each chunk - has a fixed length, and the window slides by a specified step size. -

-

Constructor Parameters:

-
    -
  • - window_size (int, optional): The number of words in each chunk. Default is - 100. -
  • -
  • - step (int, optional): The number of words to slide the window. Default is - 50. -
  • -
-

Example usage:

-
chunker = SlidingWindowChunking(window_size=100, step=50)
-chunks = chunker.chunk("This is a sample text. It will be split using a sliding window approach.")
-
-
-
-
-
-
-
-

NoExtractionStrategy

-

- NoExtractionStrategy is a basic extraction strategy that returns the entire HTML - content without any modification. It is useful for cases where no specific extraction is required. - Only clean html, and amrkdown. -

-

Constructor Parameters:

-

None.

-

Example usage:

-
extractor = NoExtractionStrategy()
-extracted_content = extractor.extract(url, html)
-
-
-
-
-
-

LLMExtractionStrategy

-

- LLMExtractionStrategy uses a Language Model (LLM) to extract meaningful blocks or - chunks from the given HTML content. This strategy leverages an external provider for language model - completions. -

-

Constructor Parameters:

-
    -
  • - provider (str, optional): The provider to use for the language model completions. - Default is DEFAULT_PROVIDER (e.g., openai/gpt-4). -
  • -
  • - api_token (str, optional): The API token for the provider. If not provided, it will - try to load from the environment variable OPENAI_API_KEY. -
  • -
  • - instruction (str, optional): An instruction to guide the LLM on how to perform the - extraction. This allows users to specify the type of data they are interested in or set the tone - of the response. Default is None. -
  • -
-

Example usage:

-
extractor = LLMExtractionStrategy(provider='openai', api_token='your_api_token', instruction='Extract only news about AI.')
-extracted_content = extractor.extract(url, html)
-
-

- By providing clear instructions, users can tailor the extraction process to their specific needs, - enhancing the relevance and utility of the extracted content. -

-
-
-
-
-

CosineStrategy

-

- CosineStrategy uses hierarchical clustering based on cosine similarity to extract - clusters of text from the given HTML content. This strategy is suitable for identifying related - content sections. -

-

Constructor Parameters:

-
    -
  • - semantic_filter (str, optional): A string containing keywords for filtering relevant - documents before clustering. If provided, documents are filtered based on their cosine - similarity to the keyword filter embedding. Default is None. -
  • -
  • - word_count_threshold (int, optional): Minimum number of words per cluster. Default - is 20. -
  • -
  • - max_dist (float, optional): The maximum cophenetic distance on the dendrogram to - form clusters. Default is 0.2. -
  • -
  • - linkage_method (str, optional): The linkage method for hierarchical clustering. - Default is 'ward'. -
  • -
  • - top_k (int, optional): Number of top categories to extract. Default is - 3. -
  • -
  • - model_name (str, optional): The model name for embedding generation. Default is - 'BAAI/bge-small-en-v1.5'. -
  • -
-

Example usage:

-
extractor = CosineStrategy(semantic_filter='artificial intelligence', word_count_threshold=10, max_dist=0.2, linkage_method='ward', top_k=3, model_name='BAAI/bge-small-en-v1.5')
-extracted_content = extractor.extract(url, html)
-
-

Cosine Similarity Filtering

-

- When a semantic_filter is provided, the CosineStrategy applies an - embedding-based filtering process to select relevant documents before performing hierarchical - clustering. -

-
-
-
-
-

TopicExtractionStrategy

-

- TopicExtractionStrategy uses the TextTiling algorithm to segment the HTML content into - topics and extracts keywords for each segment. This strategy is useful for identifying and - summarizing thematic content. -

-

Constructor Parameters:

-
    -
  • - num_keywords (int, optional): Number of keywords to represent each topic segment. - Default is 3. -
  • -
-

Example usage:

-
extractor = TopicExtractionStrategy(num_keywords=3)
-extracted_content = extractor.extract(url, html)
-
-
-
-
-
From d7c5b900b8d5d965d56417ac94681e7a11bbb1ee Mon Sep 17 00:00:00 2001 From: unclecode Date: Sun, 24 Nov 2024 19:35:53 +0800 Subject: [PATCH 04/70] feat: add support for arm64 platform in Docker commands and update INSTALL_TYPE variable in docker-compose --- README.md | 9 +++++++++ docker-compose.yml | 4 ++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index fa88a507..6c5e256e 100644 --- a/README.md +++ b/README.md @@ -142,6 +142,9 @@ docker pull unclecode/crawl4ai:gpu # GPU-enabled version # Run the container docker run -p 11235:11235 unclecode/crawl4ai:basic # Replace 'basic' with your chosen version +# In case you want to set platform to arm64 +docker run --platform linux/arm64 -p 11235:11235 unclecode/crawl4ai:basic + # In case to allocate more shared memory for the container docker run --shm-size=2gb -p 11235:11235 unclecode/crawl4ai:basic ``` @@ -158,6 +161,12 @@ docker build -t crawl4ai:local \ --build-arg INSTALL_TYPE=basic \ # Options: basic, all . +# In case you want to set platform to arm64 +docker build -t crawl4ai:local \ + --build-arg INSTALL_TYPE=basic \ # Options: basic, all + --platform linux/arm64 \ + . + # Run your local build docker run -p 11235:11235 crawl4ai:local ``` diff --git a/docker-compose.yml b/docker-compose.yml index 1097ef11..b93beda9 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -4,8 +4,8 @@ services: context: . 
dockerfile: Dockerfile args: - PYTHON_VERSION: 3.10 - INSTALL_TYPE: all + PYTHON_VERSION: "3.10" + INSTALL_TYPE: ${INSTALL_TYPE:-basic} ENABLE_GPU: false profiles: ["local"] ports: From de43505ae4177ddf671c8b765f2f55c28a740e47 Mon Sep 17 00:00:00 2001 From: unclecode Date: Sun, 24 Nov 2024 19:36:30 +0800 Subject: [PATCH 05/70] feat: update version to 0.3.742 --- crawl4ai/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index 05bfd336..f06970ce 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,2 +1,2 @@ # crawl4ai/_version.py -__version__ = "0.3.741" \ No newline at end of file +__version__ = "0.3.742" \ No newline at end of file From b09a86c0c1bc1036ff4954da991dfbccf65534cd Mon Sep 17 00:00:00 2001 From: unclecode Date: Sun, 24 Nov 2024 19:40:10 +0800 Subject: [PATCH 06/70] chore: remove deprecated Docker Compose configurations for crawl4ai service --- docker-compose.hub.yml | 27 --------------------------- docker-compose.local.yml | 33 --------------------------------- 2 files changed, 60 deletions(-) delete mode 100644 docker-compose.hub.yml delete mode 100644 docker-compose.local.yml diff --git a/docker-compose.hub.yml b/docker-compose.hub.yml deleted file mode 100644 index 9bcfa982..00000000 --- a/docker-compose.hub.yml +++ /dev/null @@ -1,27 +0,0 @@ -services: - crawl4ai: - image: unclecode/crawl4ai:basic # Pull image from Docker Hub - ports: - - "11235:11235" # FastAPI server - - "8000:8000" # Alternative port - - "9222:9222" # Browser debugging - - "8080:8080" # Additional port - environment: - - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} # Optional API token - - OPENAI_API_KEY=${OPENAI_API_KEY:-} # Optional OpenAI API key - - CLAUDE_API_KEY=${CLAUDE_API_KEY:-} # Optional Claude API key - volumes: - - /dev/shm:/dev/shm # Shared memory for browser operations - deploy: - resources: - limits: - memory: 4G - reservations: - memory: 1G - restart: 
unless-stopped - healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:11235/health"] - interval: 30s - timeout: 10s - retries: 3 - start_period: 40s diff --git a/docker-compose.local.yml b/docker-compose.local.yml deleted file mode 100644 index 7dc41b47..00000000 --- a/docker-compose.local.yml +++ /dev/null @@ -1,33 +0,0 @@ -services: - crawl4ai: - build: - context: . - dockerfile: Dockerfile - args: - PYTHON_VERSION: 3.10 - INSTALL_TYPE: all - ENABLE_GPU: false - ports: - - "11235:11235" # FastAPI server - - "8000:8000" # Alternative port - - "9222:9222" # Browser debugging - - "8080:8080" # Additional port - environment: - - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} # Optional API token - - OPENAI_API_KEY=${OPENAI_API_KEY:-} # Optional OpenAI API key - - CLAUDE_API_KEY=${CLAUDE_API_KEY:-} # Optional Claude API key - volumes: - - /dev/shm:/dev/shm # Shared memory for browser operations - deploy: - resources: - limits: - memory: 4G - reservations: - memory: 1G - restart: unless-stopped - healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:11235/health"] - interval: 30s - timeout: 10s - retries: 3 - start_period: 40s \ No newline at end of file From 195c0ccf8aa5e0462b97bc8a7f5cff608b69b53a Mon Sep 17 00:00:00 2001 From: unclecode Date: Sun, 24 Nov 2024 19:40:27 +0800 Subject: [PATCH 07/70] chore: remove deprecated Docker Compose configurations for crawl4ai service --- docker-compose.hub.yml | 27 --------------------------- docker-compose.local.yml | 33 --------------------------------- 2 files changed, 60 deletions(-) delete mode 100644 docker-compose.hub.yml delete mode 100644 docker-compose.local.yml diff --git a/docker-compose.hub.yml b/docker-compose.hub.yml deleted file mode 100644 index 9bcfa982..00000000 --- a/docker-compose.hub.yml +++ /dev/null @@ -1,27 +0,0 @@ -services: - crawl4ai: - image: unclecode/crawl4ai:basic # Pull image from Docker Hub - ports: - - "11235:11235" # FastAPI server - - "8000:8000" # Alternative port - - "9222:9222" 
# Browser debugging - - "8080:8080" # Additional port - environment: - - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} # Optional API token - - OPENAI_API_KEY=${OPENAI_API_KEY:-} # Optional OpenAI API key - - CLAUDE_API_KEY=${CLAUDE_API_KEY:-} # Optional Claude API key - volumes: - - /dev/shm:/dev/shm # Shared memory for browser operations - deploy: - resources: - limits: - memory: 4G - reservations: - memory: 1G - restart: unless-stopped - healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:11235/health"] - interval: 30s - timeout: 10s - retries: 3 - start_period: 40s diff --git a/docker-compose.local.yml b/docker-compose.local.yml deleted file mode 100644 index 7dc41b47..00000000 --- a/docker-compose.local.yml +++ /dev/null @@ -1,33 +0,0 @@ -services: - crawl4ai: - build: - context: . - dockerfile: Dockerfile - args: - PYTHON_VERSION: 3.10 - INSTALL_TYPE: all - ENABLE_GPU: false - ports: - - "11235:11235" # FastAPI server - - "8000:8000" # Alternative port - - "9222:9222" # Browser debugging - - "8080:8080" # Additional port - environment: - - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} # Optional API token - - OPENAI_API_KEY=${OPENAI_API_KEY:-} # Optional OpenAI API key - - CLAUDE_API_KEY=${CLAUDE_API_KEY:-} # Optional Claude API key - volumes: - - /dev/shm:/dev/shm # Shared memory for browser operations - deploy: - resources: - limits: - memory: 4G - reservations: - memory: 1G - restart: unless-stopped - healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:11235/health"] - interval: 30s - timeout: 10s - retries: 3 - start_period: 40s \ No newline at end of file From c6a022132b9fff4db14586a55c95f346ac3da5f7 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Wed, 27 Nov 2024 14:55:56 +0800 Subject: [PATCH 08/70] docs: update CONTRIBUTORS.md to acknowledge aadityakanjolia4 for fixing 'CustomHTML2Text' bug --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 0b5dcede..81e916cb 100644 --- 
a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -10,6 +10,7 @@ We would like to thank the following people for their contributions to Crawl4AI: ## Community Contributors +- [aadityakanjolia4](https://github.com/aadityakanjolia4) - Fixing 'CustomHTML2Text' is not defined bug in the code. - [FractalMind](https://github.com/FractalMind) - Created the first official Docker Hub image and fixed Dockerfile errors - [ketonkss4](https://github.com/ketonkss4) - Identified Selenium's new capabilities, helping reduce dependencies - [jonymusky](https://github.com/jonymusky) - Javascript execution documentation, and wait_for From 73661f7d1fd37111e34e4dc9ec10f87d5a5f3afe Mon Sep 17 00:00:00 2001 From: zhounan Date: Wed, 27 Nov 2024 15:04:20 +0800 Subject: [PATCH 09/70] docs: enhance development installation instructions (#286) Thanks for your contribution. I'm merging your changes and I'll add your name to our contributor list. Thank you so much. --- README.md | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 6c5e256e..5ba33dea 100644 --- a/README.md +++ b/README.md @@ -110,7 +110,15 @@ For contributors who plan to modify the source code: ```bash git clone https://github.com/unclecode/crawl4ai.git cd crawl4ai -pip install -e . +pip install -e . # Basic installation in editable mode +``` +Install optional features: +```bash +pip install -e ".[torch]" # With PyTorch features +pip install -e ".[transformer]" # With Transformer features +pip install -e ".[cosine]" # With cosine similarity features +pip install -e ".[sync]" # With synchronous crawling (Selenium) +pip install -e ".[all]" # Install all optional features ``` ## One-Click Deployment 🚀 From f998e9e94906302a4ee32cd5e581f4fa7bd22021 Mon Sep 17 00:00:00 2001 From: Hamza Farhan Date: Wed, 27 Nov 2024 16:20:54 +0500 Subject: [PATCH 10/70] Fix: handled the cases where markdown_with_citations, references_markdown, and filtered_html might not be defined. 
(#293) Thanks, dear Farhan, for the changes you made in the code. I accepted and merged them into the main branch. Also, I will add your name to our contributor list. Thank you so much. --- crawl4ai/markdown_generation_strategy.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/crawl4ai/markdown_generation_strategy.py b/crawl4ai/markdown_generation_strategy.py index 7922c413..249bc1ce 100644 --- a/crawl4ai/markdown_generation_strategy.py +++ b/crawl4ai/markdown_generation_strategy.py @@ -84,6 +84,8 @@ class DefaultMarkdownGenerationStrategy(MarkdownGenerationStrategy): raw_markdown = raw_markdown.replace(' ```', '```') # Convert links to citations + markdown_with_citations: str = "" + references_markdown: str = "" if citations: markdown_with_citations, references_markdown = self.convert_links_to_citations( raw_markdown, base_url @@ -91,9 +93,9 @@ class DefaultMarkdownGenerationStrategy(MarkdownGenerationStrategy): # Generate fit markdown if content filter is provided fit_markdown: Optional[str] = None + filtered_html: Optional[str] = None if content_filter: - filtered_html = content_filter.filter_content(cleaned_html) - filtered_html = '\n'.join('
{}
'.format(s) for s in filtered_html) + filtered_html = '\n'.join('
{}
'.format(s) for s in content_filter.filter_content(cleaned_html)) fit_markdown = h.handle(filtered_html) return MarkdownGenerationResult( @@ -101,7 +103,7 @@ class DefaultMarkdownGenerationStrategy(MarkdownGenerationStrategy): markdown_with_citations=markdown_with_citations, references_markdown=references_markdown, fit_markdown=fit_markdown, - fit_html=filtered_html + fit_html=filtered_html, ) def fast_urljoin(base: str, url: str) -> str: From 24723b2f100ed25747b1b84a833f82e17340b457 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 28 Nov 2024 12:45:05 +0800 Subject: [PATCH 11/70] Enhance features and documentation - Updated version to 0.3.743 - Improved ManagedBrowser configuration with dynamic host/port - Implemented fast HTML formatting in web crawler - Enhanced markdown generation with a new generator class - Improved sanitization and utility functions - Added contributor details and pull request acknowledgments - Updated documentation for clearer usage scenarios - Adjusted tests to reflect class name changes --- CONTRIBUTORS.md | 8 +++ crawl4ai/__version__.py | 2 +- crawl4ai/async_crawler_strategy.py | 24 +++++---- crawl4ai/async_webcrawler.py | 12 +++-- crawl4ai/content_scraping_strategy.py | 19 ++++--- crawl4ai/markdown_generation_strategy.py | 14 ++++-- crawl4ai/utils.py | 64 +++++++++++++++++++++--- docs/md_v2/advanced/hooks-auth.md | 8 ++- tests/async/test_markdown_genertor.py | 14 +++--- 9 files changed, 123 insertions(+), 42 deletions(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 0b5dcede..deb46a9c 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -10,11 +10,19 @@ We would like to thank the following people for their contributions to Crawl4AI: ## Community Contributors +- [aadityakanjolia4](https://github.com/aadityakanjolia4) - Fix for `CustomHTML2Text` is not defined. 
- [FractalMind](https://github.com/FractalMind) - Created the first official Docker Hub image and fixed Dockerfile errors - [ketonkss4](https://github.com/ketonkss4) - Identified Selenium's new capabilities, helping reduce dependencies - [jonymusky](https://github.com/jonymusky) - Javascript execution documentation, and wait_for - [datehoer](https://github.com/datehoer) - Add browser prxy support +## Pull Requests + +- [nelzomal](https://github.com/nelzomal) - Enhance development installation instructions [#286](https://github.com/unclecode/crawl4ai/pull/286) +- [HamzaFarhan](https://github.com/HamzaFarhan) - Handled the cases where markdown_with_citations, references_markdown, and filtered_html might not be defined [#293](https://github.com/unclecode/crawl4ai/pull/293) +- [NanmiCoder](https://github.com/NanmiCoder) - fix: crawler strategy exception handling and fixes [#271](https://github.com/unclecode/crawl4ai/pull/271) + + ## Other Contributors - [Gokhan](https://github.com/gkhngyk) diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index 05bfd336..37e3c08a 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,2 +1,2 @@ # crawl4ai/_version.py -__version__ = "0.3.741" \ No newline at end of file +__version__ = "0.3.743" \ No newline at end of file diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 3f332eb0..882f9a50 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -35,13 +35,14 @@ stealth_config = StealthConfig( class ManagedBrowser: - def __init__(self, browser_type: str = "chromium", user_data_dir: Optional[str] = None, headless: bool = False, logger = None): + def __init__(self, browser_type: str = "chromium", user_data_dir: Optional[str] = None, headless: bool = False, logger = None, host: str = "localhost", debugging_port: int = 9222): self.browser_type = browser_type self.user_data_dir = user_data_dir self.headless = headless 
self.browser_process = None self.temp_dir = None - self.debugging_port = 9222 + self.debugging_port = debugging_port + self.host = host self.logger = logger self.shutting_down = False @@ -70,7 +71,7 @@ class ManagedBrowser: # Monitor browser process output for errors asyncio.create_task(self._monitor_browser_process()) await asyncio.sleep(2) # Give browser time to start - return f"http://localhost:{self.debugging_port}" + return f"http://{self.host}:{self.debugging_port}" except Exception as e: await self.cleanup() raise Exception(f"Failed to start browser: {e}") @@ -416,13 +417,13 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): else: raise ValueError(f"Invalid hook type: {hook_type}") - async def execute_hook(self, hook_type: str, *args): + async def execute_hook(self, hook_type: str, *args, **kwargs): hook = self.hooks.get(hook_type) if hook: if asyncio.iscoroutinefunction(hook): - return await hook(*args) + return await hook(*args, **kwargs) else: - return hook(*args) + return hook(*args, **kwargs) return args[0] if args else None def update_user_agent(self, user_agent: str): @@ -642,6 +643,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): session_id = kwargs.get("session_id") # Handle page creation differently for managed browser + context = None if self.use_managed_browser: if session_id: # Reuse existing session if available @@ -760,7 +762,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): return response if not kwargs.get("js_only", False): - await self.execute_hook('before_goto', page) + await self.execute_hook('before_goto', page, context = context) response = await page.goto( @@ -773,7 +775,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # response = await page.goto("about:blank") # await page.evaluate(f"window.location.href = '{url}'") - await self.execute_hook('after_goto', page) + await self.execute_hook('after_goto', page, context = context) # Get status code and headers status_code = 
response.status @@ -838,7 +840,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # await page.wait_for_timeout(100) # Check for on execution event - await self.execute_hook('on_execution_started', page) + await self.execute_hook('on_execution_started', page, context = context) if kwargs.get("simulate_user", False) or kwargs.get("magic", False): # Simulate user interactions @@ -924,7 +926,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if kwargs.get("process_iframes", False): page = await self.process_iframes(page) - await self.execute_hook('before_retrieve_html', page) + await self.execute_hook('before_retrieve_html', page, context = context) # Check if delay_before_return_html is set then wait for that time delay_before_return_html = kwargs.get("delay_before_return_html") if delay_before_return_html: @@ -935,7 +937,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): await self.remove_overlay_elements(page) html = await page.content() - await self.execute_hook('before_return_html', page, html) + await self.execute_hook('before_return_html', page, html, context = context) # Check if kwargs has screenshot=True then take screenshot screenshot_data = None diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index b8be6f35..5a46fe39 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -25,7 +25,8 @@ from .config import ( from .utils import ( sanitize_input_encode, InvalidCSSSelectorError, - format_html + format_html, + fast_format_html ) from urllib.parse import urlparse import random @@ -534,16 +535,17 @@ class AsyncWebCrawler: "timing": time.perf_counter() - t1 } ) - - - screenshot = None if not screenshot else screenshot + + if kwargs.get("prettiify", False): + cleaned_html = fast_format_html(cleaned_html) + return CrawlResult( url=url, html=html, - cleaned_html=format_html(cleaned_html), + cleaned_html=cleaned_html, markdown_v2=markdown_v2, markdown=markdown, 
fit_markdown=fit_markdown, diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index ea6a2ef8..ec6c3361 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -10,7 +10,7 @@ from urllib.parse import urljoin from requests.exceptions import InvalidSchema # from .content_cleaning_strategy import ContentCleaningStrategy from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter#, HeuristicContentFilter -from .markdown_generation_strategy import MarkdownGenerationStrategy, DefaultMarkdownGenerationStrategy +from .markdown_generation_strategy import MarkdownGenerationStrategy, DefaultMarkdownGenerator from .models import MarkdownGenerationResult from .utils import ( sanitize_input_encode, @@ -105,21 +105,28 @@ class WebScrapingStrategy(ContentScrapingStrategy): Returns: Dict containing markdown content in various formats """ - markdown_generator: Optional[MarkdownGenerationStrategy] = kwargs.get('markdown_generator', DefaultMarkdownGenerationStrategy()) + markdown_generator: Optional[MarkdownGenerationStrategy] = kwargs.get('markdown_generator', DefaultMarkdownGenerator()) if markdown_generator: try: + if kwargs.get('fit_markdown', False) and not markdown_generator.content_filter: + markdown_generator.content_filter = BM25ContentFilter( + user_query=kwargs.get('fit_markdown_user_query', None), + bm25_threshold=kwargs.get('fit_markdown_bm25_threshold', 1.0) + ) + markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown( cleaned_html=cleaned_html, base_url=url, - html2text_options=kwargs.get('html2text', {}), - content_filter=kwargs.get('content_filter', None) + html2text_options=kwargs.get('html2text', {}) ) + help_message = """""" + return { 'markdown': markdown_result.raw_markdown, - 'fit_markdown': markdown_result.fit_markdown or "Set flag 'fit_markdown' to True to get cleaned HTML content.", - 'fit_html': markdown_result.fit_html or "Set flag 
'fit_markdown' to True to get cleaned HTML content.", + 'fit_markdown': markdown_result.fit_markdown, + 'fit_html': markdown_result.fit_html, 'markdown_v2': markdown_result } except Exception as e: diff --git a/crawl4ai/markdown_generation_strategy.py b/crawl4ai/markdown_generation_strategy.py index 7922c413..b1e43f9d 100644 --- a/crawl4ai/markdown_generation_strategy.py +++ b/crawl4ai/markdown_generation_strategy.py @@ -11,6 +11,8 @@ LINK_PATTERN = re.compile(r'!?\[([^\]]+)\]\(([^)]+?)(?:\s+"([^"]*)")?\)') class MarkdownGenerationStrategy(ABC): """Abstract base class for markdown generation strategies.""" + def __init__(self, content_filter: Optional[RelevantContentFilter] = None): + self.content_filter = content_filter @abstractmethod def generate_markdown(self, @@ -23,8 +25,10 @@ class MarkdownGenerationStrategy(ABC): """Generate markdown from cleaned HTML.""" pass -class DefaultMarkdownGenerationStrategy(MarkdownGenerationStrategy): +class DefaultMarkdownGenerator(MarkdownGenerationStrategy): """Default implementation of markdown generation strategy.""" + def __init__(self, content_filter: Optional[RelevantContentFilter] = None): + super().__init__(content_filter) def convert_links_to_citations(self, markdown: str, base_url: str = "") -> Tuple[str, str]: link_map = {} @@ -84,14 +88,18 @@ class DefaultMarkdownGenerationStrategy(MarkdownGenerationStrategy): raw_markdown = raw_markdown.replace(' ```', '```') # Convert links to citations + markdown_with_citations: str = "" + references_markdown: str = "" if citations: markdown_with_citations, references_markdown = self.convert_links_to_citations( raw_markdown, base_url ) # Generate fit markdown if content filter is provided - fit_markdown: Optional[str] = None - if content_filter: + fit_markdown: Optional[str] = "" + filtered_html: Optional[str] = "" + if content_filter or self.content_filter: + content_filter = content_filter or self.content_filter filtered_html = content_filter.filter_content(cleaned_html) 
filtered_html = '\n'.join('
{}
'.format(s) for s in filtered_html) fit_markdown = h.handle(filtered_html) diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index b07562df..aaf27e91 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -233,12 +233,17 @@ def sanitize_html(html): def sanitize_input_encode(text: str) -> str: """Sanitize input to handle potential encoding issues.""" try: - # Attempt to encode and decode as UTF-8 to handle potential encoding issues - return text.encode('utf-8', errors='ignore').decode('utf-8') - except UnicodeEncodeError as e: - print(f"Warning: Encoding issue detected. Some characters may be lost. Error: {e}") - # Fall back to ASCII if UTF-8 fails - return text.encode('ascii', errors='ignore').decode('ascii') + try: + if not text: + return '' + # Attempt to encode and decode as UTF-8 to handle potential encoding issues + return text.encode('utf-8', errors='ignore').decode('utf-8') + except UnicodeEncodeError as e: + print(f"Warning: Encoding issue detected. Some characters may be lost. Error: {e}") + # Fall back to ASCII if UTF-8 fails + return text.encode('ascii', errors='ignore').decode('ascii') + except Exception as e: + raise ValueError(f"Error sanitizing input: {str(e)}") from e def escape_json_string(s): """ @@ -1079,9 +1084,54 @@ def wrap_text(draw, text, font, max_width): return '\n'.join(lines) def format_html(html_string): - soup = BeautifulSoup(html_string, 'html.parser') + soup = BeautifulSoup(html_string, 'lxml.parser') return soup.prettify() +def fast_format_html(html_string): + """ + A fast HTML formatter that uses string operations instead of parsing. 
+ + Args: + html_string (str): The HTML string to format + + Returns: + str: The formatted HTML string + """ + # Initialize variables + indent = 0 + indent_str = " " # Two spaces for indentation + formatted = [] + in_content = False + + # Split by < and > to separate tags and content + parts = html_string.replace('>', '>\n').replace('<', '\n<').split('\n') + + for part in parts: + if not part.strip(): + continue + + # Handle closing tags + if part.startswith(''): + formatted.append(indent_str * indent + part) + + # Handle opening tags + elif part.startswith('<'): + formatted.append(indent_str * indent + part) + indent += 1 + + # Handle content between tags + else: + content = part.strip() + if content: + formatted.append(indent_str * indent + content) + + return '\n'.join(formatted) + def normalize_url(href, base_url): """Normalize URLs to ensure consistent format""" from urllib.parse import urljoin, urlparse diff --git a/docs/md_v2/advanced/hooks-auth.md b/docs/md_v2/advanced/hooks-auth.md index e4b7d7ce..8da3a1cc 100644 --- a/docs/md_v2/advanced/hooks-auth.md +++ b/docs/md_v2/advanced/hooks-auth.md @@ -18,7 +18,7 @@ Let's see how we can customize the AsyncWebCrawler using hooks! 
In this example, import asyncio from crawl4ai import AsyncWebCrawler from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy -from playwright.async_api import Page, Browser +from playwright.async_api import Page, Browser, BrowserContext async def on_browser_created(browser: Browser): print("[HOOK] on_browser_created") @@ -71,7 +71,11 @@ from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy async def main(): print("\n🔗 Using Crawler Hooks: Let's see how we can customize the AsyncWebCrawler using hooks!") - crawler_strategy = AsyncPlaywrightCrawlerStrategy(verbose=True) + initial_cookies = [ + {"name": "sessionId", "value": "abc123", "domain": ".example.com"}, + {"name": "userId", "value": "12345", "domain": ".example.com"} + ] + crawler_strategy = AsyncPlaywrightCrawlerStrategy(verbose=True, cookies=initial_cookies) crawler_strategy.set_hook('on_browser_created', on_browser_created) crawler_strategy.set_hook('before_goto', before_goto) crawler_strategy.set_hook('after_goto', after_goto) diff --git a/tests/async/test_markdown_genertor.py b/tests/async/test_markdown_genertor.py index 025a0318..2b1102ab 100644 --- a/tests/async/test_markdown_genertor.py +++ b/tests/async/test_markdown_genertor.py @@ -11,7 +11,7 @@ import asyncio import os import time from typing import Dict, Any -from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerationStrategy +from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator # Get current directory __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) @@ -41,7 +41,7 @@ def test_basic_markdown_conversion(): with open(__location__ + "/data/wikipedia.html", "r") as f: cleaned_html = f.read() - generator = DefaultMarkdownGenerationStrategy() + generator = DefaultMarkdownGenerator() start_time = time.perf_counter() result = generator.generate_markdown( @@ -70,7 +70,7 @@ def test_relative_links(): Also an [image](/images/test.png) and another 
[page](/wiki/Banana). """ - generator = DefaultMarkdownGenerationStrategy() + generator = DefaultMarkdownGenerator() result = generator.generate_markdown( cleaned_html=markdown, base_url="https://en.wikipedia.org" @@ -86,7 +86,7 @@ def test_duplicate_links(): Here's a [link](/test) and another [link](/test) and a [different link](/other). """ - generator = DefaultMarkdownGenerationStrategy() + generator = DefaultMarkdownGenerator() result = generator.generate_markdown( cleaned_html=markdown, base_url="https://example.com" @@ -102,7 +102,7 @@ def test_link_descriptions(): Here's a [link with title](/test "Test Title") and a [link with description](/other) to test. """ - generator = DefaultMarkdownGenerationStrategy() + generator = DefaultMarkdownGenerator() result = generator.generate_markdown( cleaned_html=markdown, base_url="https://example.com" @@ -120,7 +120,7 @@ def test_performance_large_document(): iterations = 5 times = [] - generator = DefaultMarkdownGenerationStrategy() + generator = DefaultMarkdownGenerator() for i in range(iterations): start_time = time.perf_counter() @@ -144,7 +144,7 @@ def test_image_links(): And a regular [link](/page). 
""" - generator = DefaultMarkdownGenerationStrategy() + generator = DefaultMarkdownGenerator() result = generator.generate_markdown( cleaned_html=markdown, base_url="https://example.com" From 3ff0b0b2c472f6adfd864f580a5a73de65505e5b Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 28 Nov 2024 12:48:07 +0800 Subject: [PATCH 12/70] feat: update changelog for version 0.3.743 with new features, improvements, and contributor acknowledgments --- CHANGELOG.md | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8e5cc91a..5ec79639 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,53 @@ # Changelog +## [0.3.743] November 27, 2024 + +Enhance features and documentation +- Updated version to 0.3.743 +- Improved ManagedBrowser configuration with dynamic host/port +- Implemented fast HTML formatting in web crawler +- Enhanced markdown generation with a new generator class +- Improved sanitization and utility functions +- Added contributor details and pull request acknowledgments +- Updated documentation for clearer usage scenarios +- Adjusted tests to reflect class name changes + +### CONTRIBUTORS.md +Added new contributors and pull request details. +Updated community contributions and acknowledged pull requests. + +### crawl4ai/__version__.py +Version update. +Bumped version to 0.3.743. + +### crawl4ai/async_crawler_strategy.py +Improved ManagedBrowser configuration. +Enhanced browser initialization with configurable host and debugging port; improved hook execution. + +### crawl4ai/async_webcrawler.py +Optimized HTML processing. +Implemented 'fast_format_html' for optimized HTML formatting; applied it when 'prettiify' is enabled. + +### crawl4ai/content_scraping_strategy.py +Enhanced markdown generation strategy. +Updated to use DefaultMarkdownGenerator and improved markdown generation with filters option. 
+ +### crawl4ai/markdown_generation_strategy.py +Refactored markdown generation class. +Renamed DefaultMarkdownGenerationStrategy to DefaultMarkdownGenerator; added content filter handling. + +### crawl4ai/utils.py +Enhanced utility functions. +Improved input sanitization and enhanced HTML formatting method. + +### docs/md_v2/advanced/hooks-auth.md +Improved documentation for hooks. +Updated code examples to include cookies in crawler strategy initialization. + +### tests/async/test_markdown_genertor.py +Refactored tests to match class renaming. +Updated tests to use renamed DefaultMarkdownGenerator class. + ## [0.3.74] November 17, 2024 This changelog details the updates and changes introduced in Crawl4AI version 0.3.74. It's designed to inform developers about new features, modifications to existing components, removals, and other important information. From c2d47848102138e226ab06a4e2c40c80aef2a2cd Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 28 Nov 2024 12:56:31 +0800 Subject: [PATCH 13/70] fix: resolve merge conflict in DefaultMarkdownGenerator affecting fit_markdown generation --- crawl4ai/markdown_generation_strategy.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/crawl4ai/markdown_generation_strategy.py b/crawl4ai/markdown_generation_strategy.py index 377f6c84..f242054d 100644 --- a/crawl4ai/markdown_generation_strategy.py +++ b/crawl4ai/markdown_generation_strategy.py @@ -96,7 +96,6 @@ class DefaultMarkdownGenerator(MarkdownGenerationStrategy): ) # Generate fit markdown if content filter is provided -<<<<<<< HEAD fit_markdown: Optional[str] = "" filtered_html: Optional[str] = "" if content_filter or self.content_filter: @@ -104,8 +103,6 @@ class DefaultMarkdownGenerator(MarkdownGenerationStrategy): filtered_html = content_filter.filter_content(cleaned_html) filtered_html = '\n'.join('
{}
'.format(s) for s in filtered_html) fit_markdown = h.handle(filtered_html) ->>>>>>> origin/main - fit_markdown = h.handle(filtered_html) return MarkdownGenerationResult( raw_markdown=raw_markdown, From e4acd18429cf93ae7cd454c6b433fad703dee21c Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 28 Nov 2024 13:06:30 +0800 Subject: [PATCH 14/70] docs: update README for version 0.3.743 with new features, enhancements, and contributor acknowledgments --- README.md | 125 +++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 87 insertions(+), 38 deletions(-) diff --git a/README.md b/README.md index 5ba33dea..16d154b5 100644 --- a/README.md +++ b/README.md @@ -11,20 +11,15 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it accessible for large language models (LLMs) and AI applications. 🆓🌐 -## New in 0.3.74 ✨ +## New in 0.3.743 ✨ -- 🚀 **Blazing Fast Scraping**: Significantly improved scraping speed. -- 📥 **Download Manager**: Integrated file crawling, downloading, and tracking within `CrawlResult`. -- 📝 **Markdown Strategy**: Flexible system for custom markdown generation and formats. -- 🔗 **LLM-Friendly Citations**: Auto-converts links to numbered citations with reference lists. -- 🔎 **Markdown Filter**: BM25-based content extraction for cleaner, relevant markdown. -- 🖼️ **Image Extraction**: Supports `srcset`, `picture`, and responsive image formats. -- 🗂️ **Local/Raw HTML**: Crawl `file://` paths and raw HTML (`raw:`) directly. -- 🤖 **Browser Control**: Custom browser setups with stealth integration to bypass bots. -- ☁️ **API & Cache Boost**: CORS, static serving, and enhanced filesystem-based caching. -- 🐳 **API Gateway**: Run as an API service with secure token authentication. -- 🛠️ **Database Upgrades**: Optimized for larger content sets with faster caching. -- 🐛 **Bug Fixes**: Resolved browser context issues, memory leaks, and improved error handling. 
+🚀 **Improved ManagedBrowser Configuration**: Dynamic host and port support for more flexible browser management. +📝 **Enhanced Markdown Generation**: New generator class for better formatting and customization. +⚡ **Fast HTML Formatting**: Significantly optimized HTML formatting in the web crawler. +🛠️ **Utility & Sanitization Upgrades**: Improved sanitization and expanded utility functions for streamlined workflows. +👥 **Acknowledgments**: Added contributor details and pull request acknowledgments for better transparency. +📖 **Documentation Updates**: Clearer usage scenarios and updated guidance for better user onboarding. +🧪 **Test Adjustments**: Refined tests to align with recent class name changes. ## Try it Now! @@ -35,31 +30,85 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it acc ## Features ✨ -- 🆓 Completely free and open-source -- 🚀 Blazing fast performance, outperforming many paid services -- 🤖 LLM-friendly output formats (JSON, cleaned HTML, markdown) -- 🌐 Multi-browser support (Chromium, Firefox, WebKit) -- 🌍 Supports crawling multiple URLs simultaneously -- 🎨 Extracts and returns all media tags (Images, Audio, and Video) -- 🔗 Extracts all external and internal links -- 📚 Extracts metadata from the page -- 🔄 Custom hooks for authentication, headers, and page modifications -- 🕵️ User-agent customization -- 🖼️ Takes screenshots of pages with enhanced error handling -- 📜 Executes multiple custom JavaScripts before crawling -- 📊 Generates structured output without LLM using JsonCssExtractionStrategy -- 📚 Various chunking strategies: topic-based, regex, sentence, and more -- 🧠 Advanced extraction strategies: cosine clustering, LLM, and more -- 🎯 CSS selector support for precise data extraction -- 📝 Passes instructions/keywords to refine extraction -- 🔒 Proxy support with authentication for enhanced access -- 🔄 Session management for complex multi-page crawling -- 🌐 Asynchronous architecture for improved performance -- 🖼️ 
Improved image processing with lazy-loading detection -- 🕰️ Enhanced handling of delayed content loading -- 🔑 Custom headers support for LLM interactions -- 🖼️ iframe content extraction for comprehensive analysis -- ⏱️ Flexible timeout and delayed content retrieval options +
+🚀 Performance & Scalability + +- ⚡ **Blazing Fast Scraping**: Outperforms many paid services with cutting-edge optimization. +- 🔄 **Asynchronous Architecture**: Enhanced performance for complex multi-page crawling. +- ⚡ **Dynamic HTML Formatting**: New, fast HTML formatting for streamlined workflows. +- 🗂️ **Large Dataset Optimization**: Improved caching for handling massive content sets. + +
+ +
+🔎 Extraction Capabilities + +- 🖼️ **Comprehensive Media Support**: Extracts images, audio, video, and responsive image formats like `srcset` and `picture`. +- 📚 **Advanced Content Chunking**: Topic-based, regex, sentence-level, and cosine clustering strategies. +- 🎯 **Precise Data Extraction**: Supports CSS selectors and keyword-based refinements. +- 🔗 **All-Inclusive Link Crawling**: Extracts internal and external links. +- 📝 **Markdown Generation**: Enhanced markdown generator class for custom, clean, LLM-friendly outputs. +- 🏷️ **Metadata Extraction**: Fetches metadata directly from pages. + +
+ +
+🌐 Browser Integration + +- 🌍 **Multi-Browser Support**: Works with Chromium, Firefox, and WebKit. +- 🖥️ **ManagedBrowser with Dynamic Config**: Flexible host/port control for tailored setups. +- ⚙️ **Custom Browser Hooks**: Authentication, headers, and page modifications. +- 🕶️ **Stealth Mode**: Bypasses bot detection with advanced techniques. +- 📸 **Screenshots & JavaScript Execution**: Takes screenshots and executes custom JavaScript before crawling. + +
+ +
+📁 Input/Output Flexibility + +- 📂 **Local & Raw HTML Crawling**: Directly processes `file://` paths and raw HTML. +- 🌐 **Custom Headers for LLM**: Tailored headers for enhanced AI interactions. +- 🛠️ **Structured Output Options**: Supports JSON, cleaned HTML, and markdown outputs. + +
+ +
+🔧 Utility & Debugging + +- 🛡️ **Error Handling**: Robust error management for seamless execution. +- 🔐 **Session Management**: Handles complex, multi-page interactions. +- 🧹 **Utility Functions**: Enhanced sanitization and flexible extraction helpers. +- 🕰️ **Delayed Content Loading**: Improved handling of lazy-loading and dynamic content. + +
+ +
+🔐 Security & Accessibility + +- 🕵️ **Proxy Support**: Enables authenticated access for restricted pages. +- 🚪 **API Gateway**: Deploy as an API service with secure token authentication. +- 🌐 **CORS & Static Serving**: Enhanced support for filesystem-based caching and cross-origin requests. + +
+ +
+🌟 Community & Documentation + +- 🙌 **Contributor Acknowledgments**: Recognition for pull requests and contributions. +- 📖 **Clear Documentation**: Simplified and updated for better onboarding and usage. + +
+ +
+🎯 Cutting-Edge Features + +- 🛠️ **BM25-Based Markdown Filtering**: Extracts cleaner, context-relevant markdown. +- 📚 **LLM-Friendly Citations**: Auto-converts links to numbered citations with reference lists. +- 📡 **IFrame Content Extraction**: Comprehensive analysis for embedded content. +- 🕰️ **Flexible Content Retrieval**: Combines timing-based strategies for reliable extractions. + +
+ ## Installation 🛠️ From ce7d49484fc097a834d1eac883ecce6f444ceb1e Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 28 Nov 2024 13:06:46 +0800 Subject: [PATCH 15/70] docs: update README for version 0.3.743 with new features, enhancements, and contributor acknowledgments --- README.md | 125 +++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 87 insertions(+), 38 deletions(-) diff --git a/README.md b/README.md index 5ba33dea..16d154b5 100644 --- a/README.md +++ b/README.md @@ -11,20 +11,15 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it accessible for large language models (LLMs) and AI applications. 🆓🌐 -## New in 0.3.74 ✨ +## New in 0.3.743 ✨ -- 🚀 **Blazing Fast Scraping**: Significantly improved scraping speed. -- 📥 **Download Manager**: Integrated file crawling, downloading, and tracking within `CrawlResult`. -- 📝 **Markdown Strategy**: Flexible system for custom markdown generation and formats. -- 🔗 **LLM-Friendly Citations**: Auto-converts links to numbered citations with reference lists. -- 🔎 **Markdown Filter**: BM25-based content extraction for cleaner, relevant markdown. -- 🖼️ **Image Extraction**: Supports `srcset`, `picture`, and responsive image formats. -- 🗂️ **Local/Raw HTML**: Crawl `file://` paths and raw HTML (`raw:`) directly. -- 🤖 **Browser Control**: Custom browser setups with stealth integration to bypass bots. -- ☁️ **API & Cache Boost**: CORS, static serving, and enhanced filesystem-based caching. -- 🐳 **API Gateway**: Run as an API service with secure token authentication. -- 🛠️ **Database Upgrades**: Optimized for larger content sets with faster caching. -- 🐛 **Bug Fixes**: Resolved browser context issues, memory leaks, and improved error handling. +🚀 **Improved ManagedBrowser Configuration**: Dynamic host and port support for more flexible browser management. +📝 **Enhanced Markdown Generation**: New generator class for better formatting and customization. 
+⚡ **Fast HTML Formatting**: Significantly optimized HTML formatting in the web crawler. +🛠️ **Utility & Sanitization Upgrades**: Improved sanitization and expanded utility functions for streamlined workflows. +👥 **Acknowledgments**: Added contributor details and pull request acknowledgments for better transparency. +📖 **Documentation Updates**: Clearer usage scenarios and updated guidance for better user onboarding. +🧪 **Test Adjustments**: Refined tests to align with recent class name changes. ## Try it Now! @@ -35,31 +30,85 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it acc ## Features ✨ -- 🆓 Completely free and open-source -- 🚀 Blazing fast performance, outperforming many paid services -- 🤖 LLM-friendly output formats (JSON, cleaned HTML, markdown) -- 🌐 Multi-browser support (Chromium, Firefox, WebKit) -- 🌍 Supports crawling multiple URLs simultaneously -- 🎨 Extracts and returns all media tags (Images, Audio, and Video) -- 🔗 Extracts all external and internal links -- 📚 Extracts metadata from the page -- 🔄 Custom hooks for authentication, headers, and page modifications -- 🕵️ User-agent customization -- 🖼️ Takes screenshots of pages with enhanced error handling -- 📜 Executes multiple custom JavaScripts before crawling -- 📊 Generates structured output without LLM using JsonCssExtractionStrategy -- 📚 Various chunking strategies: topic-based, regex, sentence, and more -- 🧠 Advanced extraction strategies: cosine clustering, LLM, and more -- 🎯 CSS selector support for precise data extraction -- 📝 Passes instructions/keywords to refine extraction -- 🔒 Proxy support with authentication for enhanced access -- 🔄 Session management for complex multi-page crawling -- 🌐 Asynchronous architecture for improved performance -- 🖼️ Improved image processing with lazy-loading detection -- 🕰️ Enhanced handling of delayed content loading -- 🔑 Custom headers support for LLM interactions -- 🖼️ iframe content extraction for comprehensive analysis -- ⏱️ 
Flexible timeout and delayed content retrieval options +
+🚀 Performance & Scalability + +- ⚡ **Blazing Fast Scraping**: Outperforms many paid services with cutting-edge optimization. +- 🔄 **Asynchronous Architecture**: Enhanced performance for complex multi-page crawling. +- ⚡ **Dynamic HTML Formatting**: New, fast HTML formatting for streamlined workflows. +- 🗂️ **Large Dataset Optimization**: Improved caching for handling massive content sets. + +
+ +
+🔎 Extraction Capabilities + +- 🖼️ **Comprehensive Media Support**: Extracts images, audio, video, and responsive image formats like `srcset` and `picture`. +- 📚 **Advanced Content Chunking**: Topic-based, regex, sentence-level, and cosine clustering strategies. +- 🎯 **Precise Data Extraction**: Supports CSS selectors and keyword-based refinements. +- 🔗 **All-Inclusive Link Crawling**: Extracts internal and external links. +- 📝 **Markdown Generation**: Enhanced markdown generator class for custom, clean, LLM-friendly outputs. +- 🏷️ **Metadata Extraction**: Fetches metadata directly from pages. + +
+ +
+🌐 Browser Integration + +- 🌍 **Multi-Browser Support**: Works with Chromium, Firefox, and WebKit. +- 🖥️ **ManagedBrowser with Dynamic Config**: Flexible host/port control for tailored setups. +- ⚙️ **Custom Browser Hooks**: Authentication, headers, and page modifications. +- 🕶️ **Stealth Mode**: Bypasses bot detection with advanced techniques. +- 📸 **Screenshots & JavaScript Execution**: Takes screenshots and executes custom JavaScript before crawling. + +
+ +
+📁 Input/Output Flexibility + +- 📂 **Local & Raw HTML Crawling**: Directly processes `file://` paths and raw HTML. +- 🌐 **Custom Headers for LLM**: Tailored headers for enhanced AI interactions. +- 🛠️ **Structured Output Options**: Supports JSON, cleaned HTML, and markdown outputs. + +
+ +
+🔧 Utility & Debugging + +- 🛡️ **Error Handling**: Robust error management for seamless execution. +- 🔐 **Session Management**: Handles complex, multi-page interactions. +- 🧹 **Utility Functions**: Enhanced sanitization and flexible extraction helpers. +- 🕰️ **Delayed Content Loading**: Improved handling of lazy-loading and dynamic content. + +
+ +
+🔐 Security & Accessibility + +- 🕵️ **Proxy Support**: Enables authenticated access for restricted pages. +- 🚪 **API Gateway**: Deploy as an API service with secure token authentication. +- 🌐 **CORS & Static Serving**: Enhanced support for filesystem-based caching and cross-origin requests. + +
+ +
+🌟 Community & Documentation + +- 🙌 **Contributor Acknowledgments**: Recognition for pull requests and contributions. +- 📖 **Clear Documentation**: Simplified and updated for better onboarding and usage. + +
+ +
+🎯 Cutting-Edge Features + +- 🛠️ **BM25-Based Markdown Filtering**: Extracts cleaner, context-relevant markdown. +- 📚 **LLM-Friendly Citations**: Auto-converts links to numbered citations with reference lists. +- 📡 **IFrame Content Extraction**: Comprehensive analysis for embedded content. +- 🕰️ **Flexible Content Retrieval**: Combines timing-based strategies for reliable extractions. + +
+ ## Installation 🛠️ From d556dada9fb4003b42cf7d619ff44feef478cf2c Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 28 Nov 2024 13:07:33 +0800 Subject: [PATCH 16/70] docs: update README to keep details open for extraction capabilities, browser integration, input/output flexibility, utility & debugging, security & accessibility, community & documentation, and cutting-edge features --- README.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 16d154b5..cd643211 100644 --- a/README.md +++ b/README.md @@ -40,7 +40,7 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it acc -
+
🔎 Extraction Capabilities - 🖼️ **Comprehensive Media Support**: Extracts images, audio, video, and responsive image formats like `srcset` and `picture`. @@ -52,7 +52,7 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it acc
-
+
🌐 Browser Integration - 🌍 **Multi-Browser Support**: Works with Chromium, Firefox, and WebKit. @@ -63,7 +63,7 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it acc
-
+
📁 Input/Output Flexibility - 📂 **Local & Raw HTML Crawling**: Directly processes `file://` paths and raw HTML. @@ -72,7 +72,7 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it acc
-
+
🔧 Utility & Debugging - 🛡️ **Error Handling**: Robust error management for seamless execution. @@ -82,7 +82,7 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it acc
-
+
🔐 Security & Accessibility - 🕵️ **Proxy Support**: Enables authenticated access for restricted pages. @@ -91,7 +91,7 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it acc
-
+
🌟 Community & Documentation - 🙌 **Contributor Acknowledgments**: Recognition for pull requests and contributions. @@ -99,7 +99,7 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it acc
-
+
🎯 Cutting-Edge Features - 🛠️ **BM25-Based Markdown Filtering**: Extracts cleaner, context-relevant markdown. From 3abb573142d5588a1fc5790e2731ca8641ca4a95 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 28 Nov 2024 13:07:59 +0800 Subject: [PATCH 17/70] docs: update README for version 0.3.743 with improved formatting and contributor acknowledgments --- README.md | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index cd643211..e02d7ef8 100644 --- a/README.md +++ b/README.md @@ -13,13 +13,11 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it acc ## New in 0.3.743 ✨ -🚀 **Improved ManagedBrowser Configuration**: Dynamic host and port support for more flexible browser management. -📝 **Enhanced Markdown Generation**: New generator class for better formatting and customization. -⚡ **Fast HTML Formatting**: Significantly optimized HTML formatting in the web crawler. -🛠️ **Utility & Sanitization Upgrades**: Improved sanitization and expanded utility functions for streamlined workflows. -👥 **Acknowledgments**: Added contributor details and pull request acknowledgments for better transparency. -📖 **Documentation Updates**: Clearer usage scenarios and updated guidance for better user onboarding. -🧪 **Test Adjustments**: Refined tests to align with recent class name changes. +- 🚀 **Improved ManagedBrowser Configuration**: Dynamic host and port support for more flexible browser management. +- 📝 **Enhanced Markdown Generation**: New generator class for better formatting and customization. +- ⚡ **Fast HTML Formatting**: Significantly optimized HTML formatting in the web crawler. +- 🛠️ **Utility & Sanitization Upgrades**: Improved sanitization and expanded utility functions for streamlined workflows. +- 👥 **Acknowledgments**: Added contributor details and pull request acknowledgments for better transparency. ## Try it Now! 
From d583aa43ca1404788838820ebfb90d2e8ee8680d Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 28 Nov 2024 15:53:25 +0800 Subject: [PATCH 18/70] refactor: update cache handling in quickstart_async example to use CacheMode enum --- README.md | 470 +++++++++++++++--------------- docs/examples/quickstart_async.py | 95 +++--- 2 files changed, 296 insertions(+), 269 deletions(-) diff --git a/README.md b/README.md index e02d7ef8..5c50cdc5 100644 --- a/README.md +++ b/README.md @@ -29,94 +29,86 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it acc ## Features ✨
-🚀 Performance & Scalability - -- ⚡ **Blazing Fast Scraping**: Outperforms many paid services with cutting-edge optimization. -- 🔄 **Asynchronous Architecture**: Enhanced performance for complex multi-page crawling. -- ⚡ **Dynamic HTML Formatting**: New, fast HTML formatting for streamlined workflows. -- 🗂️ **Large Dataset Optimization**: Improved caching for handling massive content sets. +📝 Markdown Generation +- 🧹 **Clean Markdown**: Generates clean, structured Markdown with accurate formatting. +- 🎯 **Fit Markdown**: Heuristic-based filtering to remove noise and irrelevant parts for AI-friendly processing. +- 🔗 **Citations and References**: Converts page links into a numbered reference list with clean citations. +- 🛠️ **Custom Strategies**: Users can create their own Markdown generation strategies tailored to specific needs. +- 📚 **BM25 Algorithm**: Employs BM25-based filtering for extracting core information and removing irrelevant content.
-🔎 Extraction Capabilities +📊 Structured Data Extraction -- 🖼️ **Comprehensive Media Support**: Extracts images, audio, video, and responsive image formats like `srcset` and `picture`. -- 📚 **Advanced Content Chunking**: Topic-based, regex, sentence-level, and cosine clustering strategies. -- 🎯 **Precise Data Extraction**: Supports CSS selectors and keyword-based refinements. -- 🔗 **All-Inclusive Link Crawling**: Extracts internal and external links. -- 📝 **Markdown Generation**: Enhanced markdown generator class for custom, clean, LLM-friendly outputs. -- 🏷️ **Metadata Extraction**: Fetches metadata directly from pages. +- 🤖 **LLM-Driven Extraction**: Supports all LLMs (open-source and proprietary) for structured data extraction. +- 🧱 **Chunking Strategies**: Implements chunking (topic-based, regex, sentence-level) for targeted content processing. +- 🌌 **Cosine Similarity**: Find relevant content chunks based on user queries for semantic extraction. +- 🔎 **CSS-Based Extraction**: Fast schema-based data extraction using XPath and CSS selectors. +- 🔧 **Schema Definition**: Define custom schemas for extracting structured JSON from repetitive patterns.
🌐 Browser Integration -- 🌍 **Multi-Browser Support**: Works with Chromium, Firefox, and WebKit. -- 🖥️ **ManagedBrowser with Dynamic Config**: Flexible host/port control for tailored setups. -- ⚙️ **Custom Browser Hooks**: Authentication, headers, and page modifications. -- 🕶️ **Stealth Mode**: Bypasses bot detection with advanced techniques. -- 📸 **Screenshots & JavaScript Execution**: Takes screenshots and executes custom JavaScript before crawling. +- 🖥️ **Managed Browser**: Use user-owned browsers with full control, avoiding bot detection. +- 🔄 **Remote Browser Control**: Connect to Chrome Developer Tools Protocol for remote, large-scale data extraction. +- 🔒 **Session Management**: Preserve browser states and reuse them for multi-step crawling. +- 🧩 **Proxy Support**: Seamlessly connect to proxies with authentication for secure access. +- ⚙️ **Full Browser Control**: Modify headers, cookies, user agents, and more for tailored crawling setups. +- 🌍 **Multi-Browser Support**: Compatible with Chromium, Firefox, and WebKit.
-📁 Input/Output Flexibility +🔎 Crawling & Scraping -- 📂 **Local & Raw HTML Crawling**: Directly processes `file://` paths and raw HTML. -- 🌐 **Custom Headers for LLM**: Tailored headers for enhanced AI interactions. -- 🛠️ **Structured Output Options**: Supports JSON, cleaned HTML, and markdown outputs. +- 🖼️ **Media Support**: Extract images, audio, videos, and responsive image formats like `srcset` and `picture`. +- 🚀 **Dynamic Crawling**: Execute JS and wait for async or sync for dynamic content extraction. +- 📸 **Screenshots**: Capture page screenshots during crawling for debugging or analysis. +- 📂 **Raw Data Crawling**: Directly process raw HTML (`raw:`) or local files (`file://`). +- 🔗 **Comprehensive Link Extraction**: Extracts internal, external links, and embedded iframe content. +- 🛠️ **Customizable Hooks**: Define hooks at every step to customize crawling behavior. +- 💾 **Caching**: Cache data for improved speed and to avoid redundant fetches. +- 📄 **Metadata Extraction**: Retrieve structured metadata from web pages. +- 📡 **IFrame Content Extraction**: Seamless extraction from embedded iframe content.
-🔧 Utility & Debugging +🚀 Deployment +- 🐳 **Dockerized Setup**: Optimized Docker image with API server for easy deployment. +- 🔄 **API Gateway**: One-click deployment with secure token authentication for API-based workflows. +- 🌐 **Scalable Architecture**: Designed for mass-scale production and optimized server performance. +- ⚙️ **DigitalOcean Deployment**: Ready-to-deploy configurations for DigitalOcean and similar platforms. + +
+ +
+🎯 Additional Features + +- 🕶️ **Stealth Mode**: Avoid bot detection by mimicking real users. +- 🏷️ **Tag-Based Content Extraction**: Refine crawling based on custom tags, headers, or metadata. +- 🔗 **Link Analysis**: Extract and analyze all links for detailed data exploration. - 🛡️ **Error Handling**: Robust error management for seamless execution. -- 🔐 **Session Management**: Handles complex, multi-page interactions. -- 🧹 **Utility Functions**: Enhanced sanitization and flexible extraction helpers. -- 🕰️ **Delayed Content Loading**: Improved handling of lazy-loading and dynamic content. +- 🔐 **CORS & Static Serving**: Supports filesystem-based caching and cross-origin requests. +- 📖 **Clear Documentation**: Simplified and updated guides for onboarding and advanced usage. +- 🙌 **Community Recognition**: Acknowledges contributors and pull requests for transparency.
-
-🔐 Security & Accessibility - -- 🕵️ **Proxy Support**: Enables authenticated access for restricted pages. -- 🚪 **API Gateway**: Deploy as an API service with secure token authentication. -- 🌐 **CORS & Static Serving**: Enhanced support for filesystem-based caching and cross-origin requests. - -
- -
-🌟 Community & Documentation - -- 🙌 **Contributor Acknowledgments**: Recognition for pull requests and contributions. -- 📖 **Clear Documentation**: Simplified and updated for better onboarding and usage. - -
- -
-🎯 Cutting-Edge Features - -- 🛠️ **BM25-Based Markdown Filtering**: Extracts cleaner, context-relevant markdown. -- 📚 **LLM-Friendly Citations**: Auto-converts links to numbered citations with reference lists. -- 📡 **IFrame Content Extraction**: Comprehensive analysis for embedded content. -- 🕰️ **Flexible Content Retrieval**: Combines timing-based strategies for reliable extractions. - -
- - ## Installation 🛠️ Crawl4AI offers flexible installation options to suit various use cases. You can install it as a Python package or use Docker. -### Using pip 🐍 +
+🐍 Using pip Choose the installation option that best fits your needs: -#### Basic Installation +### Basic Installation For basic web crawling and scraping tasks: @@ -126,7 +118,7 @@ pip install crawl4ai By default, this will install the asynchronous version of Crawl4AI, using Playwright for web crawling. -👉 Note: When you install Crawl4AI, the setup script should automatically install and set up Playwright. However, if you encounter any Playwright-related errors, you can manually install it using one of these methods: +👉 **Note**: When you install Crawl4AI, the setup script should automatically install and set up Playwright. However, if you encounter any Playwright-related errors, you can manually install it using one of these methods: 1. Through the command line: @@ -142,15 +134,19 @@ By default, this will install the asynchronous version of Crawl4AI, using Playwr This second method has proven to be more reliable in some cases. -#### Installation with Synchronous Version +--- -If you need the synchronous version using Selenium: +### Installation with Synchronous Version + +The sync version is deprecated and will be removed in future versions. If you need the synchronous version using Selenium: ```bash pip install crawl4ai[sync] ``` -#### Development Installation +--- + +### Development Installation For contributors who plan to modify the source code: @@ -159,7 +155,9 @@ git clone https://github.com/unclecode/crawl4ai.git cd crawl4ai pip install -e . # Basic installation in editable mode ``` + Install optional features: + ```bash pip install -e ".[torch]" # With PyTorch features pip install -e ".[transformer]" # With Transformer features @@ -168,7 +166,10 @@ pip install -e ".[sync]" # With synchronous crawling (Selenium) pip install -e ".[all]" # Install all optional features ``` -## One-Click Deployment 🚀 +
+ +
+🚀 One-Click Deployment Deploy your own instance of Crawl4AI with one click: @@ -179,14 +180,19 @@ Deploy your own instance of Crawl4AI with one click: The deploy will: - Set up a Docker container with Crawl4AI - Configure Playwright and all dependencies -- Start the FastAPI server on port 11235 +- Start the FastAPI server on port `11235` - Set up health checks and auto-deployment -### Using Docker 🐳 +
+ +
+🐳 Using Docker Crawl4AI is available as Docker images for easy deployment. You can either pull directly from Docker Hub (recommended) or build from the repository. -#### Option 1: Docker Hub (Recommended) +--- + +### Option 1: Docker Hub (Recommended) ```bash # Pull and run from Docker Hub (choose one): @@ -204,7 +210,9 @@ docker run --platform linux/arm64 -p 11235:11235 unclecode/crawl4ai:basic docker run --shm-size=2gb -p 11235:11235 unclecode/crawl4ai:basic ``` -#### Option 2: Build from Repository +--- + +### Option 2: Build from Repository ```bash # Clone the repository @@ -226,7 +234,12 @@ docker build -t crawl4ai:local \ docker run -p 11235:11235 crawl4ai:local ``` -Quick test (works for both options): +--- + +### Quick Test + +Run a quick test (works for both Docker options): + ```python import requests @@ -243,143 +256,149 @@ result = requests.get(f"http://localhost:11235/task/{task_id}") For advanced configuration, environment variables, and usage examples, see our [Docker Deployment Guide](https://crawl4ai.com/mkdocs/basic/docker-deployment/). +
+ ## Quick Start 🚀 ```python import asyncio -from crawl4ai import AsyncWebCrawler +from crawl4ai import AsyncWebCrawler, CacheMode async def main(): async with AsyncWebCrawler(verbose=True) as crawler: result = await crawler.arun(url="https://www.nbcnews.com/business") - print(result.markdown) + print(result.markdown_v2.raw_markdown) # Soon to be changed to result.markdown if __name__ == "__main__": asyncio.run(main()) ``` -## Advanced Usage 🔬 +## Advanced Usage Examples 🔬 -### Executing JavaScript and Using CSS Selectors +You can check the project structure in the directory [https://github.com/unclecode/crawl4ai/docs/examples](docs/examples). Over there, you can find a variety of examples; here, some popular examples are shared. + +
+🖥️ Heuristic Markdown Generation with Clean and Fit Markdown ```python import asyncio -from crawl4ai import AsyncWebCrawler +from crawl4ai import AsyncWebCrawler, CacheMode +from crawl4ai.content_filter_strategy import BM25ContentFilter +from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator async def main(): - async with AsyncWebCrawler(verbose=True) as crawler: - js_code = ["const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"] + async with AsyncWebCrawler( + headless=True, + verbose=True, + ) as crawler: result = await crawler.arun( - url="https://www.nbcnews.com/business", - js_code=js_code, - css_selector=".wide-tease-item__description", - bypass_cache=True + url="https://docs.micronaut.io/4.7.6/guide/", + cache_mode=CacheMode.ENABLED, + markdown_generator=DefaultMarkdownGenerator( + content_filter=BM25ContentFilter(user_query=None, bm25_threshold=1.0) + ), ) - print(result.extracted_content) + print(len(result.markdown)) + print(len(result.fit_markdown)) + print(len(result.markdown_v2.fit_markdown)) if __name__ == "__main__": asyncio.run(main()) ``` -### Using a Proxy +
+ +
+🖥️ Structured Data Extraction and Executing JavaScript ```python import asyncio -from crawl4ai import AsyncWebCrawler - -async def main(): - async with AsyncWebCrawler(verbose=True, proxy="http://127.0.0.1:7890") as crawler: - result = await crawler.arun( - url="https://www.nbcnews.com/business", - bypass_cache=True - ) - print(result.markdown) - -if __name__ == "__main__": - asyncio.run(main()) -``` - -### Extracting Structured Data without LLM - -The `JsonCssExtractionStrategy` allows for precise extraction of structured data from web pages using CSS selectors. - -```python -import asyncio -import json -from crawl4ai import AsyncWebCrawler +from crawl4ai import AsyncWebCrawler, CacheMode from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +import json -async def extract_news_teasers(): +async def main(): schema = { - "name": "News Teaser Extractor", - "baseSelector": ".wide-tease-item__wrapper", - "fields": [ - { - "name": "category", - "selector": ".unibrow span[data-testid='unibrow-text']", - "type": "text", - }, - { - "name": "headline", - "selector": ".wide-tease-item__headline", - "type": "text", - }, - { - "name": "summary", - "selector": ".wide-tease-item__description", - "type": "text", - }, - { - "name": "time", - "selector": "[data-testid='wide-tease-date']", - "type": "text", - }, - { - "name": "image", - "type": "nested", - "selector": "picture.teasePicture img", - "fields": [ - {"name": "src", "type": "attribute", "attribute": "src"}, - {"name": "alt", "type": "attribute", "attribute": "alt"}, - ], - }, - { - "name": "link", - "selector": "a[href]", - "type": "attribute", - "attribute": "href", - }, - ], - } + "name": "KidoCode Courses", + "baseSelector": "section.charge-methodology .w-tab-content > div", + "fields": [ + { + "name": "section_title", + "selector": "h3.heading-50", + "type": "text", + }, + { + "name": "section_description", + "selector": ".charge-content", + "type": "text", + }, + { + "name": "course_name", + 
"selector": ".text-block-93", + "type": "text", + }, + { + "name": "course_description", + "selector": ".course-content-text", + "type": "text", + }, + { + "name": "course_icon", + "selector": ".image-92", + "type": "attribute", + "attribute": "src" + } + ] +} extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True) - async with AsyncWebCrawler(verbose=True) as crawler: + async with AsyncWebCrawler( + headless=False, + verbose=True + ) as crawler: + + # Create the JavaScript that handles clicking multiple times + js_click_tabs = """ + (async () => { + const tabs = document.querySelectorAll("section.charge-methodology .tabs-menu-3 > div"); + + for(let tab of tabs) { + // scroll to the tab + tab.scrollIntoView(); + tab.click(); + // Wait for content to load and animations to complete + await new Promise(r => setTimeout(r, 500)); + } + })(); + """ + result = await crawler.arun( - url="https://www.nbcnews.com/business", - extraction_strategy=extraction_strategy, - bypass_cache=True, + url="https://www.kidocode.com/degrees/technology", + extraction_strategy=JsonCssExtractionStrategy(schema, verbose=True), + js_code=[js_click_tabs], + cache_mode=CacheMode.BYPASS ) - assert result.success, "Failed to crawl the page" + companies = json.loads(result.extracted_content) + print(f"Successfully extracted {len(companies)} companies") + print(json.dumps(companies[0], indent=2)) - news_teasers = json.loads(result.extracted_content) - print(f"Successfully extracted {len(news_teasers)} news teasers") - print(json.dumps(news_teasers[0], indent=2)) if __name__ == "__main__": - asyncio.run(extract_news_teasers()) + asyncio.run(main()) ``` -For more advanced usage examples, check out our [Examples](https://crawl4ai.com/mkdocs/extraction/css-advanced/) section in the documentation. +
-### Extracting Structured Data with OpenAI +
+🤖 Extracting Structured Data with LLMs ```python import os import asyncio -from crawl4ai import AsyncWebCrawler +from crawl4ai import AsyncWebCrawler, CacheMode from crawl4ai.extraction_strategy import LLMExtractionStrategy from pydantic import BaseModel, Field @@ -394,6 +413,8 @@ async def main(): url='https://openai.com/api/pricing/', word_count_threshold=1, extraction_strategy=LLMExtractionStrategy( + # Here you can use any provider that the LiteLLM library supports, for instance: ollama/qwen2 + # provider="ollama/qwen2", api_token="no-token", provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY'), schema=OpenAIModelFee.schema(), extraction_type="schema", @@ -401,7 +422,7 @@ async def main(): Do not miss any models in the entire content. One extracted model JSON format should look like this: {"model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens"}.""" ), - bypass_cache=True, + cache_mode=CacheMode.BYPASS, ) print(result.extracted_content) @@ -409,105 +430,86 @@ if __name__ == "__main__": asyncio.run(main()) ``` -### Session Management and Dynamic Content Crawling +
-Crawl4AI excels at handling complex scenarios, such as crawling multiple pages with dynamic content loaded via JavaScript. Here's an example of crawling GitHub commits across multiple pages: +
+🤖 Using Your Own Browser with a Custom User Profile ```python -import asyncio -import re -from bs4 import BeautifulSoup +import os, sys +from pathlib import Path +import asyncio, time from crawl4ai import AsyncWebCrawler -async def crawl_typescript_commits(): - first_commit = "" - async def on_execution_started(page): - nonlocal first_commit - try: - while True: - await page.wait_for_selector('li.Box-sc-g0xbh4-0 h4') - commit = await page.query_selector('li.Box-sc-g0xbh4-0 h4') - commit = await commit.evaluate('(element) => element.textContent') - commit = re.sub(r'\s+', '', commit) - if commit and commit != first_commit: - first_commit = commit - break - await asyncio.sleep(0.5) - except Exception as e: - print(f"Warning: New content didn't appear after JavaScript execution: {e}") +async def test_news_crawl(): + # Create a persistent user data directory + user_data_dir = os.path.join(Path.home(), ".crawl4ai", "browser_profile") + os.makedirs(user_data_dir, exist_ok=True) - async with AsyncWebCrawler(verbose=True) as crawler: - crawler.crawler_strategy.set_hook('on_execution_started', on_execution_started) - - url = "https://github.com/microsoft/TypeScript/commits/main" - session_id = "typescript_commits_session" - all_commits = [] - - js_next_page = """ - const button = document.querySelector('a[data-testid="pagination-next-button"]'); - if (button) button.click(); - """ - - for page in range(3): # Crawl 3 pages - result = await crawler.arun( - url=url, - session_id=session_id, - css_selector="li.Box-sc-g0xbh4-0", - js=js_next_page if page > 0 else None, - bypass_cache=True, - js_only=page > 0 - ) - - assert result.success, f"Failed to crawl page {page + 1}" - - soup = BeautifulSoup(result.cleaned_html, 'html.parser') - commits = soup.select("li") - all_commits.extend(commits) - - print(f"Page {page + 1}: Found {len(commits)} commits") - - await crawler.crawler_strategy.kill_session(session_id) - print(f"Successfully crawled {len(all_commits)} commits across 3
pages") - -if __name__ == "__main__": - asyncio.run(crawl_typescript_commits()) + async with AsyncWebCrawler( + verbose=True, + headless=True, + user_data_dir=user_data_dir, + use_persistent_context=True, + headers={ + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", + "Accept-Language": "en-US,en;q=0.5", + "Accept-Encoding": "gzip, deflate, br", + "DNT": "1", + "Connection": "keep-alive", + "Upgrade-Insecure-Requests": "1", + "Sec-Fetch-Dest": "document", + "Sec-Fetch-Mode": "navigate", + "Sec-Fetch-Site": "none", + "Sec-Fetch-User": "?1", + "Cache-Control": "max-age=0", + } + ) as crawler: + url = "ADDRESS_OF_A_CHALLENGING_WEBSITE" + + result = await crawler.arun( + url, + cache_mode=CacheMode.BYPASS, + magic=True, + ) + + print(f"Successfully crawled {url}") + print(f"Content length: {len(result.markdown)}") ``` -This example demonstrates Crawl4AI's ability to handle complex scenarios where content is loaded asynchronously. It crawls multiple pages of GitHub commits, executing JavaScript to load new content and using custom hooks to ensure data is loaded before proceeding. - -For more advanced usage examples, check out our [Examples](https://crawl4ai.com/mkdocs/tutorial/episode_12_Session-Based_Crawling_for_Dynamic_Websites/) section in the documentation.
## Speed Comparison 🚀 +A test was conducted on **[NBC News - Business Section](https://www.nbcnews.com/business)** to compare Crawl4AI and Firecrawl, highlighting Crawl4AI's speed, efficiency, and advanced features. -Crawl4AI is designed with speed as a primary focus. Our goal is to provide the fastest possible response with high-quality data extraction, minimizing abstractions between the data and the user. +--- -We've conducted a speed comparison between Crawl4AI and Firecrawl, a paid service. The results demonstrate Crawl4AI's superior performance: +#### Results Summary -```bash -Firecrawl: -Time taken: 7.02 seconds -Content length: 42074 characters -Images found: 49 +| **Method** | **Time Taken** | **Markdown Length** | **Fit Markdown** | **Images Found** | +|--------------------------------|----------------|----------------------|-------------------|------------------| +| **Firecrawl** | 6.04 seconds | 38,382 characters | - | 52 | +| **Crawl4AI (Simple Crawl)** | 1.06 seconds | 42,027 characters | - | 52 | +| **Crawl4AI (Markdown Plus)** | 1.30 seconds | 54,342 characters | 11,119 characters | 52 | +| **Crawl4AI (JavaScript)** | 1.56 seconds | 75,869 characters | 13,406 characters | 92 | -Crawl4AI (simple crawl): -Time taken: 1.60 seconds -Content length: 18238 characters -Images found: 49 +--- -Crawl4AI (with JavaScript execution): -Time taken: 4.64 seconds -Content length: 40869 characters -Images found: 89 -``` +#### Key Takeaways -As you can see, Crawl4AI outperforms Firecrawl significantly: +1. **Superior Speed**: Crawl4AI processes even advanced crawls up to **6x faster** than Firecrawl, with times as low as **1.06 seconds**. +2. **Rich Content Extraction**: Crawl4AI consistently captures more comprehensive content, producing a **Markdown Plus** output of **54,342 characters**, compared to Firecrawl's **38,382 characters**. +3. 
**AI-Optimized Output**: With **Fit Markdown**, Crawl4AI removes noise to produce concise, AI-friendly outputs (**11,119–13,406 characters**) tailored for LLM workflows. +4. **Dynamic Content Handling**: Using JavaScript execution, Crawl4AI extracted **92 images** and enriched content dynamically loaded via “Load More” buttons—unmatched by Firecrawl. -- Simple crawl: Crawl4AI is over 4 times faster than Firecrawl. -- With JavaScript execution: Even when executing JavaScript to load more content (doubling the number of images found), Crawl4AI is still faster than Firecrawl's simple crawl. +--- -You can find the full comparison code in our repository at `docs/examples/crawl4ai_vs_firecrawl.py`. +#### Conclusion + +Crawl4AI outshines Firecrawl in speed, completeness, and flexibility. Its advanced features, including **Markdown Plus**, **Fit Markdown**, and **dynamic content handling**, make it the ideal choice for AI-ready web crawling. Whether you're targeting rich structured data or handling complex dynamic websites, Crawl4AI delivers unmatched performance and precision. + +You can find the full comparison code in our repository at [docs/examples/quickstart_async.py](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/quickstart_async.py). 
## Documentation 📚 diff --git a/docs/examples/quickstart_async.py b/docs/examples/quickstart_async.py index d67a8c30..e50fe456 100644 --- a/docs/examples/quickstart_async.py +++ b/docs/examples/quickstart_async.py @@ -13,7 +13,9 @@ import re from typing import Dict, List from bs4 import BeautifulSoup from pydantic import BaseModel, Field -from crawl4ai import AsyncWebCrawler +from crawl4ai import AsyncWebCrawler, CacheMode +from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator +from crawl4ai.content_filter_strategy import BM25ContentFilter from crawl4ai.extraction_strategy import ( JsonCssExtractionStrategy, LLMExtractionStrategy, @@ -51,7 +53,7 @@ async def simple_example_with_running_js_code(): url="https://www.nbcnews.com/business", js_code=js_code, # wait_for=wait_for, - bypass_cache=True, + cache_mode=CacheMode.BYPASS, ) print(result.markdown[:500]) # Print first 500 characters @@ -61,7 +63,7 @@ async def simple_example_with_css_selector(): result = await crawler.arun( url="https://www.nbcnews.com/business", css_selector=".wide-tease-item__description", - bypass_cache=True, + cache_mode=CacheMode.BYPASS, ) print(result.markdown[:500]) # Print first 500 characters @@ -132,7 +134,7 @@ async def extract_structured_data_using_llm(provider: str, api_token: str = None {"model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens"}.""", extra_args=extra_args ), - bypass_cache=True, + cache_mode=CacheMode.BYPASS, ) print(result.extracted_content) @@ -166,7 +168,7 @@ async def extract_structured_data_using_css_extractor(): result = await crawler.arun( url="https://www.coinbase.com/explore", extraction_strategy=extraction_strategy, - bypass_cache=True, + cache_mode=CacheMode.BYPASS, ) assert result.success, "Failed to crawl the page" @@ -213,7 +215,7 @@ async def crawl_dynamic_content_pages_method_1(): session_id=session_id, css_selector="li.Box-sc-g0xbh4-0", js=js_next_page if page > 0 else None, - 
bypass_cache=True, + cache_mode=CacheMode.BYPASS, js_only=page > 0, headless=False, ) @@ -282,7 +284,7 @@ async def crawl_dynamic_content_pages_method_2(): extraction_strategy=extraction_strategy, js_code=js_next_page_and_wait if page > 0 else None, js_only=page > 0, - bypass_cache=True, + cache_mode=CacheMode.BYPASS, headless=False, ) @@ -343,7 +345,7 @@ async def crawl_dynamic_content_pages_method_3(): js_code=js_next_page if page > 0 else None, wait_for=wait_for if page > 0 else None, js_only=page > 0, - bypass_cache=True, + cache_mode=CacheMode.BYPASS, headless=False, ) @@ -384,7 +386,7 @@ async def crawl_with_user_simultion(): url = "YOUR-URL-HERE" result = await crawler.arun( url=url, - bypass_cache=True, + cache_mode=CacheMode.BYPASS, magic = True, # Automatically detects and removes overlays, popups, and other elements that block content # simulate_user = True,# Causes a series of random mouse movements and clicks to simulate user interaction # override_navigator = True # Overrides the navigator object to make it look like a real user @@ -408,7 +410,7 @@ async def speed_comparison(): params={'formats': ['markdown', 'html']} ) end = time.time() - print("Firecrawl (simulated):") + print("Firecrawl:") print(f"Time taken: {end - start:.2f} seconds") print(f"Content length: {len(scrape_status['markdown'])} characters") print(f"Images found: {scrape_status['markdown'].count('cldnry.s-nbcnews.com')}") @@ -420,7 +422,7 @@ async def speed_comparison(): result = await crawler.arun( url="https://www.nbcnews.com/business", word_count_threshold=0, - bypass_cache=True, + cache_mode=CacheMode.BYPASS, verbose=False, ) end = time.time() @@ -430,6 +432,25 @@ async def speed_comparison(): print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}") print() + # Crawl4AI with advanced content filtering + start = time.time() + result = await crawler.arun( + url="https://www.nbcnews.com/business", + word_count_threshold=0, + markdown_generator=DefaultMarkdownGenerator( 
+ content_filter=BM25ContentFilter(user_query=None, bm25_threshold=1.0) + ), + cache_mode=CacheMode.BYPASS, + verbose=False, + ) + end = time.time() + print("Crawl4AI (Markdown Plus):") + print(f"Time taken: {end - start:.2f} seconds") + print(f"Content length: {len(result.markdown_v2.raw_markdown)} characters") + print(f"Fit Markdown: {len(result.markdown_v2.fit_markdown)} characters") + print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}") + print() + # Crawl4AI with JavaScript execution start = time.time() result = await crawler.arun( @@ -438,13 +459,17 @@ async def speed_comparison(): "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();" ], word_count_threshold=0, - bypass_cache=True, + cache_mode=CacheMode.BYPASS, + markdown_generator=DefaultMarkdownGenerator( + content_filter=BM25ContentFilter(user_query=None, bm25_threshold=1.0) + ), verbose=False, ) end = time.time() print("Crawl4AI (with JavaScript execution):") print(f"Time taken: {end - start:.2f} seconds") print(f"Content length: {len(result.markdown)} characters") + print(f"Fit Markdown: {len(result.markdown_v2.fit_markdown)} characters") print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}") print("\nNote on Speed Comparison:") @@ -483,7 +508,7 @@ async def generate_knowledge_graph(): url = "https://paulgraham.com/love.html" result = await crawler.arun( url=url, - bypass_cache=True, + cache_mode=CacheMode.BYPASS, extraction_strategy=extraction_strategy, # magic=True ) @@ -496,7 +521,7 @@ async def fit_markdown_remove_overlay(): url = "https://janineintheworld.com/places-to-visit-in-central-mexico" result = await crawler.arun( url=url, - bypass_cache=True, + cache_mode=CacheMode.BYPASS, word_count_threshold = 10, remove_overlay_elements=True, screenshot = True @@ -509,31 +534,31 @@ async def fit_markdown_remove_overlay(): async def main(): - await 
simple_crawl() - await simple_example_with_running_js_code() - await simple_example_with_css_selector() - await use_proxy() - await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg")) - await extract_structured_data_using_css_extractor() + # await simple_crawl() + # await simple_example_with_running_js_code() + # await simple_example_with_css_selector() + # await use_proxy() + # await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg")) + # await extract_structured_data_using_css_extractor() - # LLM extraction examples - await extract_structured_data_using_llm() - await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY")) - await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY")) - await extract_structured_data_using_llm("ollama/llama3.2") + # # LLM extraction examples + # await extract_structured_data_using_llm() + # await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY")) + # await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY")) + # await extract_structured_data_using_llm("ollama/llama3.2") - # You always can pass custom headers to the extraction strategy - custom_headers = { - "Authorization": "Bearer your-custom-token", - "X-Custom-Header": "Some-Value" - } - await extract_structured_data_using_llm(extra_headers=custom_headers) + # # You always can pass custom headers to the extraction strategy + # custom_headers = { + # "Authorization": "Bearer your-custom-token", + # "X-Custom-Header": "Some-Value" + # } + # await extract_structured_data_using_llm(extra_headers=custom_headers) - # await crawl_dynamic_content_pages_method_1() - # await crawl_dynamic_content_pages_method_2() - await crawl_dynamic_content_pages_method_3() + # # await 
crawl_dynamic_content_pages_method_1() + # # await crawl_dynamic_content_pages_method_2() + # await crawl_dynamic_content_pages_method_3() - await crawl_custom_browser_type() + # await crawl_custom_browser_type() await speed_comparison() From a69f7a953198df1d9d93420161794aafe3fcffcb Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 28 Nov 2024 16:31:41 +0800 Subject: [PATCH 19/70] fix: correct typo in function documentation for clarity and accuracy --- README.md | 184 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 105 insertions(+), 79 deletions(-) diff --git a/README.md b/README.md index 5c50cdc5..c4ef1bd3 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,7 @@ # 🔥🕷️ Crawl4AI: LLM Friendly Web Crawler & Scraper +[✨ Check out what's new in the latest update!](#new-in-03743) + unclecode%2Fcrawl4ai | Trendshift [![GitHub Stars](https://img.shields.io/github/stars/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/stargazers) @@ -9,26 +11,47 @@ [![GitHub Pull Requests](https://img.shields.io/github/issues-pr/unclecode/crawl4ai)](https://github.com/unclecode/crawl4ai/pulls) [![License](https://img.shields.io/github/license/unclecode/crawl4ai)](https://github.com/unclecode/crawl4ai/blob/main/LICENSE) -Crawl4AI simplifies asynchronous web crawling and data extraction, making it accessible for large language models (LLMs) and AI applications. 🆓🌐 +## 🔥 Crawl4AI: Crawl Smarter, Faster, Freely. For AI. -## New in 0.3.743 ✨ +Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant community. It delivers blazing-fast, AI-ready web crawling tailored for LLMs, AI agents, and data pipelines. Open source, flexible, and built for real-time performance, Crawl4AI empowers developers with unmatched speed, precision, and deployment ease. -- 🚀 **Improved ManagedBrowser Configuration**: Dynamic host and port support for more flexible browser management. 
-- 📝 **Enhanced Markdown Generation**: New generator class for better formatting and customization. -- ⚡ **Fast HTML Formatting**: Significantly optimized HTML formatting in the web crawler. -- 🛠️ **Utility & Sanitization Upgrades**: Improved sanitization and expanded utility functions for streamlined workflows. -- 👥 **Acknowledgments**: Added contributor details and pull request acknowledgments for better transparency. +[✨ Check out what's new in the latest update!](#new-in-03743) + +## 🧐 Why Crawl4AI? + +1. **Built for LLMs**: Creates **smart, concise Markdown** optimized for applications like Retrieval-Augmented Generation (RAG) and fine-tuning. +2. **Lightning Fast**: Delivers results **6x faster** than competitors with real-time, cost-efficient performance. +3. **Flexible Browser Control**: Offers session management, proxies, and custom hooks for precise, seamless data access. +4. **Heuristic Intelligence**: Leverages **advanced algorithms** to extract data efficiently, reducing reliance on costly language models. +5. **Open Source & Deployable**: 100% open-source with no API keys or registration required-ready for **Docker and cloud integration**. +6. **Thriving Community**: Actively maintained by a vibrant developer community and the **#1 trending GitHub repository** across all languages. -## Try it Now! +## 🚀 Quick Start -✨ Play around with this [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1SgRPrByQLzjRfwoRNq1wSGE9nYY_EE8C?usp=sharing) +1. Install Crawl4AI: +```bash +pip install crawl4ai +``` -✨ Visit our [Documentation Website](https://crawl4ai.com/mkdocs/) +2. 
Run a simple web crawl: +```python +import asyncio +from crawl4ai import AsyncWebCrawler, CacheMode -## Features ✨ +async def main(): + async with AsyncWebCrawler(verbose=True) as crawler: + result = await crawler.arun(url="https://www.nbcnews.com/business") + # Soon this will change to result.markdown + print(result.markdown_v2.raw_markdown) -
+if __name__ == "__main__": + asyncio.run(main()) +``` + +## ✨ Features + +
📝 Markdown Generation - 🧹 **Clean Markdown**: Generates clean, structured Markdown with accurate formatting. @@ -38,7 +61,7 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it acc - 📚 **BM25 Algorithm**: Employs BM25-based filtering for extracting core information and removing irrelevant content.
-
+
📊 Structured Data Extraction - 🤖 **LLM-Driven Extraction**: Supports all LLMs (open-source and proprietary) for structured data extraction. @@ -49,7 +72,7 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it acc
-
+
🌐 Browser Integration - 🖥️ **Managed Browser**: Use user-owned browsers with full control, avoiding bot detection. @@ -61,7 +84,7 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it acc
-
+
🔎 Crawling & Scraping - 🖼️ **Media Support**: Extract images, audio, videos, and responsive image formats like `srcset` and `picture`. @@ -76,7 +99,7 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it acc
-
+
🚀 Deployment - 🐳 **Dockerized Setup**: Optimized Docker image with API server for easy deployment. @@ -99,7 +122,54 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it acc
-## Installation 🛠️ + + +## Try it Now! + +✨ Play around with this [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1SgRPrByQLzjRfwoRNq1wSGE9nYY_EE8C?usp=sharing) + +✨ Visit our [Documentation Website](https://crawl4ai.com/mkdocs/) + + +## 🚀 Speed Comparison + +A test was conducted on **[NBC News - Business Section](https://www.nbcnews.com/business)** to compare Crawl4AI and Firecrawl, highlighting Crawl4AI's speed, efficiency, and advanced features. + +
+📊 Results Summary + +#### Results Summary + +| **Method** | **Time Taken** | **Markdown Length** | **Fit Markdown** | **Images Found** | +|--------------------------------|----------------|----------------------|-------------------|------------------| +| **Firecrawl** | 6.04 seconds | 38,382 characters | - | 52 | +| **Crawl4AI (Simple Crawl)** | 1.06 seconds | 42,027 characters | - | 52 | +| **Crawl4AI (Markdown Plus)** | 1.30 seconds | 54,342 characters | 11,119 characters | 52 | +| **Crawl4AI (JavaScript)** | 1.56 seconds | 75,869 characters | 13,406 characters | 92 | + +
+ +
+Key Takeaways + +1. **Superior Speed**: Crawl4AI processes even advanced crawls up to **6x faster** than Firecrawl, with times as low as **1.06 seconds**. +2. **Rich Content Extraction**: Crawl4AI consistently captures more comprehensive content, producing a **Markdown Plus** output of **54,342 characters**, compared to Firecrawl's **38,382 characters**. +3. **AI-Optimized Output**: With **Fit Markdown**, Crawl4AI removes noise to produce concise, AI-friendly outputs (**11,119–13,406 characters**) tailored for LLM workflows. +4. **Dynamic Content Handling**: Using JavaScript execution, Crawl4AI extracted **92 images** and enriched content dynamically loaded via “Load More” buttons—unmatched by Firecrawl. + +
+ +
+🏁 Conclusion + +Crawl4AI outshines Firecrawl in speed, completeness, and flexibility. Its advanced features, including **Markdown Plus**, **Fit Markdown**, and **dynamic content handling**, make it the ideal choice for AI-ready web crawling. Whether you're targeting rich structured data or handling complex dynamic websites, Crawl4AI delivers unmatched performance and precision. + +You can find the full comparison code in our repository at [docs/examples/quickstart_async.py](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/quickstart_async.py). + +
+ + +## 🛠️ Installation 🛠️ Crawl4AI offers flexible installation options to suit various use cases. You can install it as a Python package or use Docker. @@ -259,27 +329,14 @@ For advanced configuration, environment variables, and usage examples, see our [
-## Quick Start 🚀 -```python -import asyncio -from crawl4ai import AsyncWebCrawler, CacheMode -async def main(): - async with AsyncWebCrawler(verbose=True) as crawler: - result = await crawler.arun(url="https://www.nbcnews.com/business") - print(result.markdown_v2.raw_markdown) # Soone will be change to result.markdown - -if __name__ == "__main__": - asyncio.run(main()) -``` - -## Advanced Usage Examples 🔬 +## 🔬 Advanced Usage Examples 🔬 You can check the project structure in the directory [https://github.com/unclecode/crawl4ai/docs/examples](docs/examples). Over there, you can find a variety of examples; here, some popular examples are shared.
-🖥️ Heuristic Markdown Generation with Clean and Fit Markdown +📝 Heuristic Markdown Generation with Clean and Fit Markdown ```python import asyncio @@ -310,7 +367,7 @@ if __name__ == "__main__":
-🖥️ Structured Data Extraction and Executing JavaScript +🖥️ Executing JavaScript & Extract Structured Data without LLMs ```python import asyncio @@ -393,7 +450,7 @@ if __name__ == "__main__":
-🤖 Extracting Structured Data with LLMs +📚 Extracting Structured Data with LLMs ```python import os @@ -480,74 +537,43 @@ async def test_news_crawl():
-## Speed Comparison 🚀 -A test was conducted on **[NBC News - Business Section](https://www.nbcnews.com/business)** to compare Crawl4AI and Firecrawl, highlighting Crawl4AI's speed, efficiency, and advanced features. +## ✨ New in 0.3.743 ---- +- 🚀 **Improved ManagedBrowser Configuration**: Dynamic host and port support for more flexible browser management. +- 📝 **Enhanced Markdown Generation**: New generator class for better formatting and customization. +- ⚡ **Fast HTML Formatting**: Significantly optimized HTML formatting in the web crawler. +- 🛠️ **Utility & Sanitization Upgrades**: Improved sanitization and expanded utility functions for streamlined workflows. +- 👥 **Acknowledgments**: Added contributor details and pull request acknowledgments for better transparency. -#### Results Summary -| **Method** | **Time Taken** | **Markdown Length** | **Fit Markdown** | **Images Found** | -|--------------------------------|----------------|----------------------|-------------------|------------------| -| **Firecrawl** | 6.04 seconds | 38,382 characters | - | 52 | -| **Crawl4AI (Simple Crawl)** | 1.06 seconds | 42,027 characters | - | 52 | -| **Crawl4AI (Markdown Plus)** | 1.30 seconds | 54,342 characters | 11,119 characters | 52 | -| **Crawl4AI (JavaScript)** | 1.56 seconds | 75,869 characters | 13,406 characters | 92 | - ---- - -#### Key Takeaways - -1. **Superior Speed**: Crawl4AI processes even advanced crawls up to **6x faster** than Firecrawl, with times as low as **1.06 seconds**. -2. **Rich Content Extraction**: Crawl4AI consistently captures more comprehensive content, producing a **Markdown Plus** output of **54,342 characters**, compared to Firecrawl's **38,382 characters**. -3. **AI-Optimized Output**: With **Fit Markdown**, Crawl4AI removes noise to produce concise, AI-friendly outputs (**11,119–13,406 characters**) tailored for LLM workflows. -4. 
**Dynamic Content Handling**: Using JavaScript execution, Crawl4AI extracted **92 images** and enriched content dynamically loaded via “Load More” buttons—unmatched by Firecrawl. - ---- - -#### Conclusion - -Crawl4AI outshines Firecrawl in speed, completeness, and flexibility. Its advanced features, including **Markdown Plus**, **Fit Markdown**, and **dynamic content handling**, make it the ideal choice for AI-ready web crawling. Whether you're targeting rich structured data or handling complex dynamic websites, Crawl4AI delivers unmatched performance and precision. - -You can find the full comparison code in our repository at [docs/examples/quickstart_async.py](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/quickstart_async.py). - -## Documentation 📚 +## 📖 Documentation & Roadmap For detailed documentation, including installation instructions, advanced features, and API reference, visit our [Documentation Website](https://crawl4ai.com/mkdocs/). -## Crawl4AI Roadmap 🗺️ +Moreover to check our development plans and upcoming features, check out our [Roadmap](https://github.com/unclecode/crawl4ai/blob/main/ROADMAP.md). -For detailed information on our development plans and upcoming features, check out our [Roadmap](https://github.com/unclecode/crawl4ai/blob/main/ROADMAP.md). - -### Advanced Crawling Systems 🔧 - [x] 0. Graph Crawler: Smart website traversal using graph search algorithms for comprehensive nested page extraction - [ ] 1. Question-Based Crawler: Natural language driven web discovery and content extraction - [ ] 2. Knowledge-Optimal Crawler: Smart crawling that maximizes knowledge while minimizing data extraction - [ ] 3. Agentic Crawler: Autonomous system for complex multi-step crawling operations - -### Specialized Features 🛠️ - [ ] 4. Automated Schema Generator: Convert natural language to extraction schemas - [ ] 5. Domain-Specific Scrapers: Pre-configured extractors for common platforms (academic, e-commerce) - [ ] 6. 
Web Embedding Index: Semantic search infrastructure for crawled content - -### Development Tools 🔨 - [ ] 7. Interactive Playground: Web UI for testing, comparing strategies with AI assistance - [ ] 8. Performance Monitor: Real-time insights into crawler operations - [ ] 9. Cloud Integration: One-click deployment solutions across cloud providers - -### Community & Growth 🌱 - [ ] 10. Sponsorship Program: Structured support system with tiered benefits - [ ] 11. Educational Content: "How to Crawl" video series and interactive tutorials -## Contributing 🤝 +## 🤝 Contributing We welcome contributions from the open-source community. Check out our [contribution guidelines](https://github.com/unclecode/crawl4ai/blob/main/CONTRIBUTING.md) for more information. -## License 📄 +## 📄 License Crawl4AI is released under the [Apache 2.0 License](https://github.com/unclecode/crawl4ai/blob/main/LICENSE). -## Contact 📧 +## 📧 Contact For questions, suggestions, or feedback, feel free to reach out: @@ -558,7 +584,7 @@ For questions, suggestions, or feedback, feel free to reach out: Happy Crawling! 🕸️🚀 -# Mission +## 🗾 Mission Our mission is to unlock the untapped potential of personal and enterprise data in the digital age. In today's world, individuals and organizations generate vast amounts of valuable digital footprints, yet this data remains largely uncapitalized as a true asset. @@ -570,13 +596,13 @@ This democratization of data represents the first step toward a shared data econ For a detailed exploration of our vision, opportunities, and pathway forward, please see our [full mission statement](./MISSION.md). 
-## Key Opportunities +### Key Opportunities - **Data Capitalization**: Transform digital footprints into valuable assets that can appear on personal and enterprise balance sheets - **Authentic Data**: Unlock the vast reservoir of real human insights and knowledge for AI advancement - **Shared Economy**: Create new value streams where data creators directly benefit from their contributions -## Development Pathway +### Development Pathway 1. **Open-Source Foundation**: Building transparent, community-driven data extraction tools 2. **Data Capitalization Platform**: Creating tools to structure and value digital assets From ddfb6707b47b6be786c2115cd7511b3d94d89e7c Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 28 Nov 2024 16:34:08 +0800 Subject: [PATCH 20/70] docs: update README to reflect new branding and improve section headings for clarity --- README.md | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index c4ef1bd3..ed6892ec 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,4 @@ -# 🔥🕷️ Crawl4AI: LLM Friendly Web Crawler & Scraper - -[✨ Check out what's new in the latest update!](#new-in-03743) +# 🔥🕷️ Crawl4AI: Crawl Smarter, Faster, Freely. For AI. unclecode%2Fcrawl4ai | Trendshift @@ -11,11 +9,9 @@ [![GitHub Pull Requests](https://img.shields.io/github/issues-pr/unclecode/crawl4ai)](https://github.com/unclecode/crawl4ai/pulls) [![License](https://img.shields.io/github/license/unclecode/crawl4ai)](https://github.com/unclecode/crawl4ai/blob/main/LICENSE) -## 🔥 Crawl4AI: Crawl Smarter, Faster, Freely. For AI. - Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant community. It delivers blazing-fast, AI-ready web crawling tailored for LLMs, AI agents, and data pipelines. Open source, flexible, and built for real-time performance, Crawl4AI empowers developers with unmatched speed, precision, and deployment ease. 
-[✨ Check out what's new in the latest update!](#new-in-03743) +[✨ Check out what's new in the latest update!](#recent-updates) ## 🧐 Why Crawl4AI? @@ -537,7 +533,7 @@ async def test_news_crawl():
-## ✨ New in 0.3.743 +## ✨ Recent Updates - 🚀 **Improved ManagedBrowser Configuration**: Dynamic host and port support for more flexible browser management. - 📝 **Enhanced Markdown Generation**: New generator class for better formatting and customization. From 3fda66b85b793655a92b3627599472f4d3279b0b Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 28 Nov 2024 16:36:24 +0800 Subject: [PATCH 21/70] docs: refine README content for clarity and conciseness, improving descriptions and formatting --- README.md | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index ed6892ec..7bf4b4a4 100644 --- a/README.md +++ b/README.md @@ -15,13 +15,12 @@ Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant ## 🧐 Why Crawl4AI? -1. **Built for LLMs**: Creates **smart, concise Markdown** optimized for applications like Retrieval-Augmented Generation (RAG) and fine-tuning. -2. **Lightning Fast**: Delivers results **6x faster** than competitors with real-time, cost-efficient performance. -3. **Flexible Browser Control**: Offers session management, proxies, and custom hooks for precise, seamless data access. -4. **Heuristic Intelligence**: Leverages **advanced algorithms** to extract data efficiently, reducing reliance on costly language models. -5. **Open Source & Deployable**: 100% open-source with no API keys or registration required-ready for **Docker and cloud integration**. -6. **Thriving Community**: Actively maintained by a vibrant developer community and the **#1 trending GitHub repository** across all languages. - +1. **Built for LLMs**: Creates smart, concise Markdown optimized for RAG and fine-tuning applications. +2. **Lightning Fast**: Delivers results 6x faster with real-time, cost-efficient performance. +3. **Flexible Browser Control**: Offers session management, proxies, and custom hooks for seamless data access. +4. 
**Heuristic Intelligence**: Uses advanced algorithms for efficient extraction, reducing reliance on costly models. +5. **Open Source & Deployable**: Fully open-source with no API keys—ready for Docker and cloud integration. +6. **Thriving Community**: Actively maintained by a vibrant community and the #1 trending GitHub repository. ## 🚀 Quick Start @@ -145,7 +144,7 @@ A test was conducted on **[NBC News - Business Section](https://www.nbcnews.com/
-
+
Key Takeaways 1. **Superior Speed**: Crawl4AI processes even advanced crawls up to **6x faster** than Firecrawl, with times as low as **1.06 seconds**. @@ -155,7 +154,7 @@ A test was conducted on **[NBC News - Business Section](https://www.nbcnews.com/
-
+
🏁 Conclusion Crawl4AI outshines Firecrawl in speed, completeness, and flexibility. Its advanced features, including **Markdown Plus**, **Fit Markdown**, and **dynamic content handling**, make it the ideal choice for AI-ready web crawling. Whether you're targeting rich structured data or handling complex dynamic websites, Crawl4AI delivers unmatched performance and precision. @@ -169,7 +168,7 @@ You can find the full comparison code in our repository at [docs/examples/quicks Crawl4AI offers flexible installation options to suit various use cases. You can install it as a Python package or use Docker. -
+
🐍 Using pip Choose the installation option that best fits your needs: @@ -234,7 +233,7 @@ pip install -e ".[all]" # Install all optional features
-
+
🚀 One-Click Deployment Deploy your own instance of Crawl4AI with one click: @@ -251,7 +250,7 @@ The deploy will:
-
+
🐳 Using Docker Crawl4AI is available as Docker images for easy deployment. You can either pull directly from Docker Hub (recommended) or build from the repository. @@ -325,13 +324,11 @@ For advanced configuration, environment variables, and usage examples, see our [
- - ## 🔬 Advanced Usage Examples 🔬 You can check the project structure in the directory [https://github.com/unclecode/crawl4ai/docs/examples](docs/examples). Over there, you can find a variety of examples; here, some popular examples are shared. -
+
📝 Heuristic Markdown Generation with Clean and Fit Markdown ```python @@ -362,7 +359,7 @@ if __name__ == "__main__":
-
+
🖥️ Executing JavaScript & Extract Structured Data without LLMs ```python @@ -445,7 +442,7 @@ if __name__ == "__main__":
-
+
📚 Extracting Structured Data with LLMs ```python @@ -485,7 +482,7 @@ if __name__ == "__main__":
-
+
🤖 Using You own Browswer with Custome User Profile ```python From efe93a5f57ebe677cc12dca90549525626a85b98 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 28 Nov 2024 16:41:11 +0800 Subject: [PATCH 22/70] docs: enhance README with development TODOs and refine mission statement for clarity --- README.md | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 7bf4b4a4..20395b58 100644 --- a/README.md +++ b/README.md @@ -545,6 +545,9 @@ For detailed documentation, including installation instructions, advanced featur Moreover to check our development plans and upcoming features, check out our [Roadmap](https://github.com/unclecode/crawl4ai/blob/main/ROADMAP.md). +
+📈 Development TODOs + - [x] 0. Graph Crawler: Smart website traversal using graph search algorithms for comprehensive nested page extraction - [ ] 1. Question-Based Crawler: Natural language driven web discovery and content extraction - [ ] 2. Knowledge-Optimal Crawler: Smart crawling that maximizes knowledge while minimizing data extraction @@ -558,6 +561,8 @@ Moreover to check our development plans and upcoming features, check out our [Ro - [ ] 10. Sponsorship Program: Structured support system with tiered benefits - [ ] 11. Educational Content: "How to Crawl" video series and interactive tutorials +
+ ## 🤝 Contributing We welcome contributions from the open-source community. Check out our [contribution guidelines](https://github.com/unclecode/crawl4ai/blob/main/CONTRIBUTING.md) for more information. @@ -576,32 +581,32 @@ For questions, suggestions, or feedback, feel free to reach out: Happy Crawling! 🕸️🚀 - ## 🗾 Mission -Our mission is to unlock the untapped potential of personal and enterprise data in the digital age. In today's world, individuals and organizations generate vast amounts of valuable digital footprints, yet this data remains largely uncapitalized as a true asset. +Our mission is to unlock the value of personal and enterprise data by transforming digital footprints into structured, tradeable assets. Crawl4AI empowers individuals and organizations with open-source tools to extract and structure data, fostering a shared data economy. -Our open-source solution empowers developers and innovators to build tools for data extraction and structuring, laying the foundation for a new era of data ownership. By transforming personal and enterprise data into structured, tradeable assets, we're creating opportunities for individuals to capitalize on their digital footprints and for organizations to unlock the value of their collective knowledge. +We envision a future where AI is powered by real human knowledge, ensuring data creators directly benefit from their contributions. By democratizing data and enabling ethical sharing, we are laying the foundation for authentic AI advancement. -This democratization of data represents the first step toward a shared data economy, where willing participation in data sharing drives AI advancement while ensuring the benefits flow back to data creators. Through this approach, we're building a future where AI development is powered by authentic human knowledge rather than synthetic alternatives. +
+🔑 Key Opportunities + +- **Data Capitalization**: Transform digital footprints into measurable, valuable assets. +- **Authentic AI Data**: Provide AI systems with real human insights. +- **Shared Economy**: Create a fair data marketplace that benefits data creators. -![Mission Diagram](./docs/assets/pitch-dark.svg) +
-For a detailed exploration of our vision, opportunities, and pathway forward, please see our [full mission statement](./MISSION.md). +
+🚀 Development Pathway -### Key Opportunities +1. **Open-Source Tools**: Community-driven platforms for transparent data extraction. +2. **Digital Asset Structuring**: Tools to organize and value digital knowledge. +3. **Ethical Data Marketplace**: A secure, fair platform for exchanging structured data. -- **Data Capitalization**: Transform digital footprints into valuable assets that can appear on personal and enterprise balance sheets -- **Authentic Data**: Unlock the vast reservoir of real human insights and knowledge for AI advancement -- **Shared Economy**: Create new value streams where data creators directly benefit from their contributions +For more details, see our [full mission statement](./MISSION.md). +
-### Development Pathway -1. **Open-Source Foundation**: Building transparent, community-driven data extraction tools -2. **Data Capitalization Platform**: Creating tools to structure and value digital assets -3. **Shared Data Marketplace**: Establishing an economic platform for ethical data exchange - -For a detailed exploration of our vision, challenges, and solutions, please see our [full mission statement](./MISSION.md). ## Star History From 0bccf23db3f90bf07342f34591c91b92eb1cdf89 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 28 Nov 2024 18:19:42 +0800 Subject: [PATCH 23/70] docs: update quickstart_async.py to enable example function calls for better demonstration --- docs/examples/quickstart_async.py | 36 +++++++++++++++---------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/docs/examples/quickstart_async.py b/docs/examples/quickstart_async.py index e50fe456..9f1eff53 100644 --- a/docs/examples/quickstart_async.py +++ b/docs/examples/quickstart_async.py @@ -534,31 +534,31 @@ async def fit_markdown_remove_overlay(): async def main(): - # await simple_crawl() - # await simple_example_with_running_js_code() - # await simple_example_with_css_selector() - # await use_proxy() - # await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg")) - # await extract_structured_data_using_css_extractor() + await simple_crawl() + await simple_example_with_running_js_code() + await simple_example_with_css_selector() + await use_proxy() + await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg")) + await extract_structured_data_using_css_extractor() - # # LLM extraction examples + # LLM extraction examples # await extract_structured_data_using_llm() # await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY")) - # await 
extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY")) # await extract_structured_data_using_llm("ollama/llama3.2") + await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY")) - # # You always can pass custom headers to the extraction strategy - # custom_headers = { - # "Authorization": "Bearer your-custom-token", - # "X-Custom-Header": "Some-Value" - # } - # await extract_structured_data_using_llm(extra_headers=custom_headers) + # You always can pass custom headers to the extraction strategy + custom_headers = { + "Authorization": "Bearer your-custom-token", + "X-Custom-Header": "Some-Value" + } + await extract_structured_data_using_llm(extra_headers=custom_headers) - # # await crawl_dynamic_content_pages_method_1() - # # await crawl_dynamic_content_pages_method_2() - # await crawl_dynamic_content_pages_method_3() + # await crawl_dynamic_content_pages_method_1() + # await crawl_dynamic_content_pages_method_2() + await crawl_dynamic_content_pages_method_3() - # await crawl_custom_browser_type() + await crawl_custom_browser_type() await speed_comparison() From a036b7f12224d6a424118e3d113e49ab1e2c9e13 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 28 Nov 2024 19:24:07 +0800 Subject: [PATCH 24/70] feat: implement create_box_message utility for formatted error messages and enhance error logging in AsyncWebCrawler --- crawl4ai/async_crawler_strategy.py | 21 +++++----- crawl4ai/async_webcrawler.py | 8 ++-- crawl4ai/utils.py | 64 ++++++++++++++++++++++++++++-- 3 files changed, 77 insertions(+), 16 deletions(-) diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 882f9a50..e5316187 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -15,7 +15,7 @@ import hashlib import json import uuid from .models import AsyncCrawlResponse - +from .utils import create_box_message from playwright_stealth import StealthConfig, stealth_async stealth_config = 
StealthConfig( @@ -321,10 +321,10 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): "--disable-infobars", "--window-position=0,0", "--ignore-certificate-errors", - "--ignore-certificate-errors-spki-list", + "--ignore-certificate-errors-spki-list" ] } - + # Add channel if specified (try Chrome first) if self.chrome_channel: browser_args["channel"] = self.chrome_channel @@ -765,12 +765,15 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): await self.execute_hook('before_goto', page, context = context) - response = await page.goto( - url, - # wait_until=kwargs.get("wait_until", ["domcontentloaded", "networkidle"]), - wait_until=kwargs.get("wait_until", "domcontentloaded"), - timeout=kwargs.get("page_timeout", 60000) - ) + try: + response = await page.goto( + url, + # wait_until=kwargs.get("wait_until", ["domcontentloaded", "networkidle"]), + wait_until=kwargs.get("wait_until", "domcontentloaded"), + timeout=kwargs.get("page_timeout", 60000), + ) + except Error as e: + raise RuntimeError(f"Failed on navigating ACS-GOTO :\n{str(e)}") # response = await page.goto("about:blank") # await page.evaluate(f"window.location.href = '{url}'") diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 5a46fe39..66b4c21b 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -26,8 +26,10 @@ from .utils import ( sanitize_input_encode, InvalidCSSSelectorError, format_html, - fast_format_html + fast_format_html, + create_box_message ) + from urllib.parse import urlparse import random from .__version__ import __version__ as crawl4ai_version @@ -326,15 +328,15 @@ class AsyncWebCrawler: if not hasattr(e, "msg"): e.msg = str(e) # print(f"{Fore.RED}{self.tag_format('ERROR')} {self.log_icons['ERROR']} Failed to crawl {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... 
| {e.msg}{Style.RESET_ALL}") + self.logger.error_status( url=cache_context.display_url, - error=e.msg, + error=create_box_message(e.msg, type = "error"), tag="ERROR" ) return CrawlResult( url=url, html="", - markdown=f"[ERROR] 🚫 arun(): Failed to crawl {cache_context.display_url}, error: {e.msg}", success=False, error_message=e.msg ) diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index aaf27e91..253ec079 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -17,7 +17,8 @@ from requests.exceptions import InvalidSchema import hashlib from typing import Optional, Tuple, Dict, Any import xxhash - +from colorama import Fore, Style, init +import textwrap from .html2text import HTML2Text class CustomHTML2Text(HTML2Text): @@ -103,12 +104,67 @@ class CustomHTML2Text(HTML2Text): self.preserved_content.append(data) return super().handle_data(data, entity_char) - - - class InvalidCSSSelectorError(Exception): pass + +def create_box_message( + message: str, + type: str = "info", + width: int = 80, + add_newlines: bool = True, + double_line: bool = False +) -> str: + init() + + # Define border and text colors for different types + styles = { + "warning": (Fore.YELLOW, Fore.LIGHTYELLOW_EX, "⚠"), + "info": (Fore.BLUE, Fore.LIGHTBLUE_EX, "ℹ"), + "success": (Fore.GREEN, Fore.LIGHTGREEN_EX, "✓"), + "error": (Fore.RED, Fore.LIGHTRED_EX, "×"), + } + + border_color, text_color, prefix = styles.get(type.lower(), styles["info"]) + + # Define box characters based on line style + box_chars = { + "single": ("─", "│", "┌", "┐", "└", "┘"), + "double": ("═", "║", "╔", "╗", "╚", "╝") + } + line_style = "double" if double_line else "single" + h_line, v_line, tl, tr, bl, br = box_chars[line_style] + + # Process lines with lighter text color + formatted_lines = [] + raw_lines = message.split('\n') + + if raw_lines: + first_line = f"{prefix} {raw_lines[0].strip()}" + wrapped_first = textwrap.fill(first_line, width=width-4) + formatted_lines.extend(wrapped_first.split('\n')) + + for line in 
raw_lines[1:]: + if line.strip(): + wrapped = textwrap.fill(f" {line.strip()}", width=width-4) + formatted_lines.extend(wrapped.split('\n')) + else: + formatted_lines.append("") + + # Create the box with colored borders and lighter text + horizontal_line = h_line * (width - 1) + box = [ + f"{border_color}{tl}{horizontal_line}{tr}", + *[f"{border_color}{v_line}{text_color} {line:<{width-2}}{border_color}{v_line}" for line in formatted_lines], + f"{border_color}{bl}{horizontal_line}{br}{Style.RESET_ALL}" + ] + + result = "\n".join(box) + if add_newlines: + result = f"\n{result}\n" + + return result + def calculate_semaphore_count(): cpu_count = os.cpu_count() memory_gb = get_system_memory() / (1024 ** 3) # Convert to GB From a9b6b6523812333400fd66730ce3e3c184ad79e2 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 28 Nov 2024 19:26:50 +0800 Subject: [PATCH 25/70] chore: update version to 0.3.744 and add publish.sh to .gitignore --- .gitignore | 1 + crawl4ai/__version__.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 8e96fa82..52e25a2a 100644 --- a/.gitignore +++ b/.gitignore @@ -214,3 +214,4 @@ git_issues.md todo_executor.md protect-all-except-feature.sh manage-collab.sh +publish.sh \ No newline at end of file diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index 073b371c..e38cc61b 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,2 +1,2 @@ # crawl4ai/_version.py -__version__ = "0.3.743" +__version__ = "0.3.744" From b14e83f49951cba097e67464546ba2b4f2787cdc Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 28 Nov 2024 19:31:09 +0800 Subject: [PATCH 26/70] docs: fix link formatting for recent updates section in README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d22d8940..26cc9fcc 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ Crawl4AI is the #1 trending GitHub repository, actively maintained by a 
vibrant community. It delivers blazing-fast, AI-ready web crawling tailored for LLMs, AI agents, and data pipelines. Open source, flexible, and built for real-time performance, Crawl4AI empowers developers with unmatched speed, precision, and deployment ease. -[✨ Check out what's new in the latest update!](#recent-updates) +[✨ Check out what's new in the latest update!](#-recent-updates) ## 🧐 Why Crawl4AI? From 776efa74a4c9fde71377f986cc69b201632a59c0 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 28 Nov 2024 19:32:32 +0800 Subject: [PATCH 27/70] docs: fix link formatting for recent updates section in README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 26cc9fcc..01197868 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant community. It delivers blazing-fast, AI-ready web crawling tailored for LLMs, AI agents, and data pipelines. Open source, flexible, and built for real-time performance, Crawl4AI empowers developers with unmatched speed, precision, and deployment ease. -[✨ Check out what's new in the latest update!](#-recent-updates) +[✨ Check out what's new in the latest update!](#--recent-updates) ## 🧐 Why Crawl4AI? From 48d43c14b1864b87866e8114f5c4fc6e415b6e51 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 28 Nov 2024 19:33:02 +0800 Subject: [PATCH 28/70] docs: fix link formatting for recent updates section in README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 01197868..26cc9fcc 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant community. It delivers blazing-fast, AI-ready web crawling tailored for LLMs, AI agents, and data pipelines. 
Open source, flexible, and built for real-time performance, Crawl4AI empowers developers with unmatched speed, precision, and deployment ease. -[✨ Check out what's new in the latest update!](#--recent-updates) +[✨ Check out what's new in the latest update!](#-recent-updates) ## 🧐 Why Crawl4AI? From 9221c08418bbfaa0d0cf48b4f933e3a2ae722f3a Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 28 Nov 2024 19:33:36 +0800 Subject: [PATCH 29/70] docs: fix link formatting for recent updates section in README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d22d8940..26cc9fcc 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant community. It delivers blazing-fast, AI-ready web crawling tailored for LLMs, AI agents, and data pipelines. Open source, flexible, and built for real-time performance, Crawl4AI empowers developers with unmatched speed, precision, and deployment ease. -[✨ Check out what's new in the latest update!](#recent-updates) +[✨ Check out what's new in the latest update!](#-recent-updates) ## 🧐 Why Crawl4AI? From cf35cbe59e39222b8e3c76ffadc67a7fea55df7a Mon Sep 17 00:00:00 2001 From: Paulo Kuong Date: Thu, 28 Nov 2024 06:46:36 -0500 Subject: [PATCH 30/70] CRAWL4_AI_BASE_DIRECTORY should be Path object instead of string (#298) Thank you so much for your point. Yes, that's correct. I accept your pull request, and I add your name to a contribution list. Thank you again. 
--- setup.py | 50 +++++++++++++++++++++++++++++++++++--------------- 1 file changed, 35 insertions(+), 15 deletions(-) diff --git a/setup.py b/setup.py index f5f3cf2d..796c3bf9 100644 --- a/setup.py +++ b/setup.py @@ -9,10 +9,16 @@ import asyncio # Create the .crawl4ai folder in the user's home directory if it doesn't exist # If the folder already exists, remove the cache folder -crawl4ai_folder = os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()) / ".crawl4ai" +crawl4ai_folder = Path(os.getenv("CRAWL4_AI_BASE_DIRECTORY")) or Path.home() +crawl4ai_folder = crawl4ai_folder / ".crawl4ai" cache_folder = crawl4ai_folder / "cache" -content_folders = ['html_content', 'cleaned_html', 'markdown_content', - 'extracted_content', 'screenshots'] +content_folders = [ + "html_content", + "cleaned_html", + "markdown_content", + "extracted_content", + "screenshots", +] # Clean up old cache if exists if cache_folder.exists(): @@ -28,7 +34,7 @@ for folder in content_folders: __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) with open(os.path.join(__location__, "requirements.txt")) as f: requirements = f.read().splitlines() - + with open("crawl4ai/__version__.py") as f: for line in f: if line.startswith("__version__"): @@ -37,11 +43,12 @@ with open("crawl4ai/__version__.py") as f: # Define requirements default_requirements = requirements -torch_requirements = ["torch", "nltk", "scikit-learn"] +torch_requirements = ["torch", "nltk", "scikit-learn"] transformer_requirements = ["transformers", "tokenizers"] -cosine_similarity_requirements = ["torch", "transformers", "nltk" ] +cosine_similarity_requirements = ["torch", "transformers", "nltk"] sync_requirements = ["selenium"] + def install_playwright(): print("Installing Playwright browsers...") try: @@ -49,16 +56,22 @@ def install_playwright(): print("Playwright installation completed successfully.") except subprocess.CalledProcessError as e: print(f"Error during Playwright installation: {e}") - 
print("Please run 'python -m playwright install' manually after the installation.") + print( + "Please run 'python -m playwright install' manually after the installation." + ) except Exception as e: print(f"Unexpected error during Playwright installation: {e}") - print("Please run 'python -m playwright install' manually after the installation.") + print( + "Please run 'python -m playwright install' manually after the installation." + ) + def run_migration(): """Initialize database during installation""" try: print("Starting database initialization...") from crawl4ai.async_database import async_db_manager + asyncio.run(async_db_manager.initialize()) print("Database initialization completed successfully.") except ImportError: @@ -67,12 +80,14 @@ def run_migration(): print(f"Warning: Database initialization failed: {e}") print("Database will be initialized on first use") + class PostInstallCommand(install): def run(self): install.run(self) install_playwright() # run_migration() + setup( name="Crawl4AI", version=version, @@ -84,18 +99,23 @@ setup( author_email="unclecode@kidocode.com", license="MIT", packages=find_packages(), - install_requires=default_requirements + ["playwright", "aiofiles"], # Added aiofiles + install_requires=default_requirements + + ["playwright", "aiofiles"], # Added aiofiles extras_require={ "torch": torch_requirements, "transformer": transformer_requirements, "cosine": cosine_similarity_requirements, "sync": sync_requirements, - "all": default_requirements + torch_requirements + transformer_requirements + cosine_similarity_requirements + sync_requirements, + "all": default_requirements + + torch_requirements + + transformer_requirements + + cosine_similarity_requirements + + sync_requirements, }, entry_points={ - 'console_scripts': [ - 'crawl4ai-download-models=crawl4ai.model_loader:main', - 'crawl4ai-migrate=crawl4ai.migrations:main', # Added migration command + "console_scripts": [ + "crawl4ai-download-models=crawl4ai.model_loader:main", + 
"crawl4ai-migrate=crawl4ai.migrations:main", # Added migration command ], }, classifiers=[ @@ -110,6 +130,6 @@ setup( ], python_requires=">=3.7", cmdclass={ - 'install': PostInstallCommand, + "install": PostInstallCommand, }, -) \ No newline at end of file +) From 1d83c493aff8672c9da471c222f60c5c72145b71 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 28 Nov 2024 19:58:40 +0800 Subject: [PATCH 31/70] Enhance setup process and update contributors list - Acknowledge contributor paulokuong for fixing RAWL4_AI_BASE_DIRECTORY issue - Refine base directory handling in `setup.py` - Clarify Playwright installation instructions and improve error handling --- CONTRIBUTORS.md | 1 + setup.py | 48 ++++++++++++++++++++++++++++++++++-------------- 2 files changed, 35 insertions(+), 14 deletions(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index deb46a9c..663e5541 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -21,6 +21,7 @@ We would like to thank the following people for their contributions to Crawl4AI: - [nelzomal](https://github.com/nelzomal) - Enhance development installation instructions [#286](https://github.com/unclecode/crawl4ai/pull/286) - [HamzaFarhan](https://github.com/HamzaFarhan) - Handled the cases where markdown_with_citations, references_markdown, and filtered_html might not be defined [#293](https://github.com/unclecode/crawl4ai/pull/293) - [NanmiCoder](https://github.com/NanmiCoder) - fix: crawler strategy exception handling and fixes [#271](https://github.com/unclecode/crawl4ai/pull/271) +- [paulokuong](https://github.com/paulokuong) - fix: RAWL4_AI_BASE_DIRECTORY should be Path object instead of string [#298](https://github.com/unclecode/crawl4ai/pull/298) ## Other Contributors diff --git a/setup.py b/setup.py index f5f3cf2d..dbb07410 100644 --- a/setup.py +++ b/setup.py @@ -9,10 +9,16 @@ import asyncio # Create the .crawl4ai folder in the user's home directory if it doesn't exist # If the folder already exists, remove the cache folder 
-crawl4ai_folder = os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()) / ".crawl4ai" +crawl4ai_folder = Path(os.getenv("CRAWL4_AI_BASE_DIRECTORY")) or Path.home() +crawl4ai_folder = crawl4ai_folder / ".crawl4ai" cache_folder = crawl4ai_folder / "cache" -content_folders = ['html_content', 'cleaned_html', 'markdown_content', - 'extracted_content', 'screenshots'] +content_folders = [ + "html_content", + "cleaned_html", + "markdown_content", + "extracted_content", + "screenshots", +] # Clean up old cache if exists if cache_folder.exists(): @@ -28,7 +34,7 @@ for folder in content_folders: __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) with open(os.path.join(__location__, "requirements.txt")) as f: requirements = f.read().splitlines() - + with open("crawl4ai/__version__.py") as f: for line in f: if line.startswith("__version__"): @@ -37,11 +43,12 @@ with open("crawl4ai/__version__.py") as f: # Define requirements default_requirements = requirements -torch_requirements = ["torch", "nltk", "scikit-learn"] +torch_requirements = ["torch", "nltk", "scikit-learn"] transformer_requirements = ["transformers", "tokenizers"] -cosine_similarity_requirements = ["torch", "transformers", "nltk" ] +cosine_similarity_requirements = ["torch", "transformers", "nltk"] sync_requirements = ["selenium"] + def install_playwright(): print("Installing Playwright browsers...") try: @@ -49,16 +56,22 @@ def install_playwright(): print("Playwright installation completed successfully.") except subprocess.CalledProcessError as e: print(f"Error during Playwright installation: {e}") - print("Please run 'python -m playwright install' manually after the installation.") + print( + "Please run 'python -m playwright install' manually after the installation." 
+ ) except Exception as e: print(f"Unexpected error during Playwright installation: {e}") - print("Please run 'python -m playwright install' manually after the installation.") + print( + "Please run 'python -m playwright install' manually after the installation." + ) + def run_migration(): """Initialize database during installation""" try: print("Starting database initialization...") from crawl4ai.async_database import async_db_manager + asyncio.run(async_db_manager.initialize()) print("Database initialization completed successfully.") except ImportError: @@ -67,12 +80,14 @@ def run_migration(): print(f"Warning: Database initialization failed: {e}") print("Database will be initialized on first use") + class PostInstallCommand(install): def run(self): install.run(self) install_playwright() # run_migration() + setup( name="Crawl4AI", version=version, @@ -84,18 +99,23 @@ setup( author_email="unclecode@kidocode.com", license="MIT", packages=find_packages(), - install_requires=default_requirements + ["playwright", "aiofiles"], # Added aiofiles + install_requires=default_requirements + + ["playwright", "aiofiles"], # Added aiofiles extras_require={ "torch": torch_requirements, "transformer": transformer_requirements, "cosine": cosine_similarity_requirements, "sync": sync_requirements, - "all": default_requirements + torch_requirements + transformer_requirements + cosine_similarity_requirements + sync_requirements, + "all": default_requirements + + torch_requirements + + transformer_requirements + + cosine_similarity_requirements + + sync_requirements, }, entry_points={ - 'console_scripts': [ - 'crawl4ai-download-models=crawl4ai.model_loader:main', - 'crawl4ai-migrate=crawl4ai.migrations:main', # Added migration command + "console_scripts": [ + "crawl4ai-download-models=crawl4ai.model_loader:main", + "crawl4ai-migrate=crawl4ai.migrations:main", # Added migration command ], }, classifiers=[ @@ -110,6 +130,6 @@ setup( ], python_requires=">=3.7", cmdclass={ - 'install': 
PostInstallCommand, + "install": PostInstallCommand, }, ) \ No newline at end of file From 652d396a818a01d9673920da8c1a2d166f0d23f1 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 28 Nov 2024 20:00:29 +0800 Subject: [PATCH 32/70] chore: update version to 0.3.745 --- crawl4ai/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index e38cc61b..8b69d491 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,2 +1,2 @@ # crawl4ai/_version.py -__version__ = "0.3.744" +__version__ = "0.3.745" From 7d81c17cca98b720d06743d6398d1184350ccc75 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 28 Nov 2024 20:02:39 +0800 Subject: [PATCH 33/70] fix: improve handling of CRAWL4_AI_BASE_DIRECTORY environment variable in setup.py --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index dbb07410..d891ff9f 100644 --- a/setup.py +++ b/setup.py @@ -9,7 +9,8 @@ import asyncio # Create the .crawl4ai folder in the user's home directory if it doesn't exist # If the folder already exists, remove the cache folder -crawl4ai_folder = Path(os.getenv("CRAWL4_AI_BASE_DIRECTORY")) or Path.home() +base_dir = os.getenv("CRAWL4_AI_BASE_DIRECTORY") +crawl4ai_folder = Path(base_dir) if base_dir else Path.home() crawl4ai_folder = crawl4ai_folder / ".crawl4ai" cache_folder = crawl4ai_folder / "cache" content_folders = [ From c8485776fe2e475bbba1f8ee513679999283441c Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 28 Nov 2024 20:04:16 +0800 Subject: [PATCH 34/70] docs: update README to reflect latest version v0.3.745 --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 26cc9fcc..e8e6cddf 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant community. 
It delivers blazing-fast, AI-ready web crawling tailored for LLMs, AI agents, and data pipelines. Open source, flexible, and built for real-time performance, Crawl4AI empowers developers with unmatched speed, precision, and deployment ease. -[✨ Check out what's new in the latest update!](#-recent-updates) +[✨ Check out latest update v0.3.745](#-recent-updates) ## 🧐 Why Crawl4AI? From c0e87abaee97e9e206eb787f8939fdf8790f4a2b Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 28 Nov 2024 21:43:08 +0800 Subject: [PATCH 35/70] fix: update package versions in requirements.txt for compatibility --- requirements.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/requirements.txt b/requirements.txt index ed259ac9..c0f6f183 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,16 +1,16 @@ aiosqlite~=0.20 html2text~=2024.2 lxml~=5.3 -litellm~=1.48 +litellm>=1.53.1 numpy>=1.26.0,<3 pillow~=10.4 -playwright>=1.47,<1.48 +playwright>=1.49.0 python-dotenv~=1.0 requests~=2.26 beautifulsoup4~=4.12 -tf-playwright-stealth~=1.0 +tf-playwright-stealth>=1.1.0 xxhash~=3.4 rank-bm25~=0.2 -aiofiles~=24.0 +aiofiles>=24.1.0 colorama~=0.4 snowballstemmer~=2.2 \ No newline at end of file From b0419edda6c0a25da82f65f557beee4e0a3daf02 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 29 Nov 2024 02:31:17 +0800 Subject: [PATCH 36/70] Update README.md (#300) --- README.md | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/README.md b/README.md index e8e6cddf..c9d92e17 100644 --- a/README.md +++ b/README.md @@ -125,34 +125,6 @@ if __name__ == "__main__": ✨ Visit our [Documentation Website](https://crawl4ai.com/mkdocs/) -## Features ✨ - -- 🆓 Completely free and open-source -- 🚀 Blazing fast performance, outperforming many paid services -- 🤖 LLM-friendly output formats (JSON, cleaned HTML, markdown) -- 🌐 Multi-browser support (Chromium, Firefox, WebKit) -- 🌍 Supports crawling multiple URLs simultaneously -- 🎨 Extracts and returns all 
media tags (Images, Audio, and Video) -- 🔗 Extracts all external and internal links -- 📚 Extracts metadata from the page -- 🔄 Custom hooks for authentication, headers, and page modifications -- 🕵️ User-agent customization -- 🖼️ Takes screenshots of pages with enhanced error handling -- 📜 Executes multiple custom JavaScripts before crawling -- 📊 Generates structured output without LLM using JsonCssExtractionStrategy -- 📚 Various chunking strategies: topic-based, regex, sentence, and more -- 🧠 Advanced extraction strategies: cosine clustering, LLM, and more -- 🎯 CSS selector support for precise data extraction -- 📝 Passes instructions/keywords to refine extraction -- 🔒 Proxy support with authentication for enhanced access -- 🔄 Session management for complex multi-page crawling -- 🌐 Asynchronous architecture for improved performance -- 🖼️ Improved image processing with lazy-loading detection -- 🕰️ Enhanced handling of delayed content loading -- 🔑 Custom headers support for LLM interactions -- 🖼️ iframe content extraction for comprehensive analysis -- ⏱️ Flexible timeout and delayed content retrieval options - ## Installation 🛠️ Crawl4AI offers flexible installation options to suit various use cases. You can install it as a Python package or use Docker. From 449dd7cc0b9d81e0f602b3868b478c8515a45bf1 Mon Sep 17 00:00:00 2001 From: unclecode Date: Fri, 29 Nov 2024 14:45:04 +0800 Subject: [PATCH 37/70] Migrating from the classic setup.py to a using PyProject approach. 
--- MANIFEST.in | 1 - build_hooks.py | 48 +++++++++++ docs/examples/quickstart_async.py | 128 +++++++++++++++++----------- plugin.py | 9 ++ post_install.py | 19 +++++ pyproject.toml | 75 ++++++++++++++++ requirements.txt | 16 ---- setup.cfg | 2 - setup.py | 136 ------------------------------ 9 files changed, 229 insertions(+), 205 deletions(-) delete mode 100644 MANIFEST.in create mode 100644 build_hooks.py create mode 100644 plugin.py create mode 100644 post_install.py create mode 100644 pyproject.toml delete mode 100644 requirements.txt delete mode 100644 setup.cfg delete mode 100644 setup.py diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 540b7204..00000000 --- a/MANIFEST.in +++ /dev/null @@ -1 +0,0 @@ -include requirements.txt \ No newline at end of file diff --git a/build_hooks.py b/build_hooks.py new file mode 100644 index 00000000..e59b5910 --- /dev/null +++ b/build_hooks.py @@ -0,0 +1,48 @@ +import os +import shutil +from pathlib import Path +import subprocess +import sys +from hatchling.builders.hooks.plugin.interface import BuildHookInterface +PLUGIN = "CustomBuildHook" + +class CustomBuildHook(BuildHookInterface): + def initialize(self, version, build_data): + # Create the .crawl4ai folder structure + base_dir = os.getenv("CRAWL4_AI_BASE_DIRECTORY") + crawl4ai_folder = Path(base_dir) if base_dir else Path.home() + crawl4ai_folder = crawl4ai_folder / ".crawl4ai" + cache_folder = crawl4ai_folder / "cache" + content_folders = [ + "html_content", + "cleaned_html", + "markdown_content", + "extracted_content", + "screenshots", + ] + + # Clean up old cache if exists + if cache_folder.exists(): + shutil.rmtree(cache_folder) + + # Create new folder structure + crawl4ai_folder.mkdir(exist_ok=True) + cache_folder.mkdir(exist_ok=True) + for folder in content_folders: + (crawl4ai_folder / folder).mkdir(exist_ok=True) + + # Install Playwright browsers + try: + subprocess.check_call([sys.executable, "-m", "playwright", "install"]) + except 
Exception as e: + print(f"Warning: Playwright installation failed: {e}") + print("Please run 'python -m playwright install' manually after installation") + + # Initialize database + try: + from crawl4ai.async_database import async_db_manager + import asyncio + asyncio.run(async_db_manager.initialize()) + except Exception as e: + print(f"Warning: Database initialization failed: {e}") + print("Database will be initialized on first use") \ No newline at end of file diff --git a/docs/examples/quickstart_async.py b/docs/examples/quickstart_async.py index 9f1eff53..01f7677c 100644 --- a/docs/examples/quickstart_async.py +++ b/docs/examples/quickstart_async.py @@ -32,7 +32,7 @@ print("Website: https://crawl4ai.com") async def simple_crawl(): print("\n--- Basic Usage ---") async with AsyncWebCrawler(verbose=True) as crawler: - result = await crawler.arun(url="https://www.nbcnews.com/business") + result = await crawler.arun(url="https://www.nbcnews.com/business", cache_mode= CacheMode.BYPASS) print(result.markdown[:500]) # Print first 500 characters async def simple_example_with_running_js_code(): @@ -76,16 +76,17 @@ async def use_proxy(): async with AsyncWebCrawler(verbose=True, proxy="http://your-proxy-url:port") as crawler: result = await crawler.arun( url="https://www.nbcnews.com/business", - bypass_cache=True + cache_mode= CacheMode.BYPASS ) - print(result.markdown[:500]) # Print first 500 characters + if result.success: + print(result.markdown[:500]) # Print first 500 characters async def capture_and_save_screenshot(url: str, output_path: str): async with AsyncWebCrawler(verbose=True) as crawler: result = await crawler.arun( url=url, screenshot=True, - bypass_cache=True + cache_mode= CacheMode.BYPASS ) if result.success and result.screenshot: @@ -141,41 +142,68 @@ async def extract_structured_data_using_llm(provider: str, api_token: str = None async def extract_structured_data_using_css_extractor(): print("\n--- Using JsonCssExtractionStrategy for Fast Structured 
Output ---") schema = { - "name": "Coinbase Crypto Prices", - "baseSelector": ".cds-tableRow-t45thuk", - "fields": [ - { - "name": "crypto", - "selector": "td:nth-child(1) h2", - "type": "text", - }, - { - "name": "symbol", - "selector": "td:nth-child(1) p", - "type": "text", - }, - { - "name": "price", - "selector": "td:nth-child(2)", - "type": "text", + "name": "KidoCode Courses", + "baseSelector": "section.charge-methodology .w-tab-content > div", + "fields": [ + { + "name": "section_title", + "selector": "h3.heading-50", + "type": "text", + }, + { + "name": "section_description", + "selector": ".charge-content", + "type": "text", + }, + { + "name": "course_name", + "selector": ".text-block-93", + "type": "text", + }, + { + "name": "course_description", + "selector": ".course-content-text", + "type": "text", + }, + { + "name": "course_icon", + "selector": ".image-92", + "type": "attribute", + "attribute": "src" + } + ] +} + + async with AsyncWebCrawler( + headless=True, + verbose=True + ) as crawler: + + # Create the JavaScript that handles clicking multiple times + js_click_tabs = """ + (async () => { + const tabs = document.querySelectorAll("section.charge-methodology .tabs-menu-3 > div"); + + for(let tab of tabs) { + // scroll to the tab + tab.scrollIntoView(); + tab.click(); + // Wait for content to load and animations to complete + await new Promise(r => setTimeout(r, 500)); } - ], - } + })(); + """ - extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True) - - async with AsyncWebCrawler(verbose=True) as crawler: result = await crawler.arun( - url="https://www.coinbase.com/explore", - extraction_strategy=extraction_strategy, - cache_mode=CacheMode.BYPASS, + url="https://www.kidocode.com/degrees/technology", + extraction_strategy=JsonCssExtractionStrategy(schema, verbose=True), + js_code=[js_click_tabs], + cache_mode=CacheMode.BYPASS ) - assert result.success, "Failed to crawl the page" - - news_teasers = json.loads(result.extracted_content) - 
print(f"Successfully extracted {len(news_teasers)} news teasers") - print(json.dumps(news_teasers[0], indent=2)) + companies = json.loads(result.extracted_content) + print(f"Successfully extracted {len(companies)} companies") + print(json.dumps(companies[0], indent=2)) # Advanced Session-Based Crawling with Dynamic Content 🔄 async def crawl_dynamic_content_pages_method_1(): @@ -363,21 +391,21 @@ async def crawl_custom_browser_type(): # Use Firefox start = time.time() async with AsyncWebCrawler(browser_type="firefox", verbose=True, headless = True) as crawler: - result = await crawler.arun(url="https://www.example.com", bypass_cache=True) + result = await crawler.arun(url="https://www.example.com", cache_mode= CacheMode.BYPASS) print(result.markdown[:500]) print("Time taken: ", time.time() - start) # Use WebKit start = time.time() async with AsyncWebCrawler(browser_type="webkit", verbose=True, headless = True) as crawler: - result = await crawler.arun(url="https://www.example.com", bypass_cache=True) + result = await crawler.arun(url="https://www.example.com", cache_mode= CacheMode.BYPASS) print(result.markdown[:500]) print("Time taken: ", time.time() - start) # Use Chromium (default) start = time.time() async with AsyncWebCrawler(verbose=True, headless = True) as crawler: - result = await crawler.arun(url="https://www.example.com", bypass_cache=True) + result = await crawler.arun(url="https://www.example.com", cache_mode= CacheMode.BYPASS) print(result.markdown[:500]) print("Time taken: ", time.time() - start) @@ -534,29 +562,29 @@ async def fit_markdown_remove_overlay(): async def main(): - await simple_crawl() - await simple_example_with_running_js_code() - await simple_example_with_css_selector() - await use_proxy() - await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg")) - await extract_structured_data_using_css_extractor() + # await simple_crawl() + # await simple_example_with_running_js_code() + 
# await simple_example_with_css_selector() + # await use_proxy() + # await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg")) + # await extract_structured_data_using_css_extractor() # LLM extraction examples # await extract_structured_data_using_llm() # await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY")) # await extract_structured_data_using_llm("ollama/llama3.2") - await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY")) + # await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY")) # You always can pass custom headers to the extraction strategy - custom_headers = { - "Authorization": "Bearer your-custom-token", - "X-Custom-Header": "Some-Value" - } - await extract_structured_data_using_llm(extra_headers=custom_headers) + # custom_headers = { + # "Authorization": "Bearer your-custom-token", + # "X-Custom-Header": "Some-Value" + # } + # await extract_structured_data_using_llm(extra_headers=custom_headers) # await crawl_dynamic_content_pages_method_1() # await crawl_dynamic_content_pages_method_2() - await crawl_dynamic_content_pages_method_3() + # await crawl_dynamic_content_pages_method_3() await crawl_custom_browser_type() diff --git a/plugin.py b/plugin.py new file mode 100644 index 00000000..1e1b11bf --- /dev/null +++ b/plugin.py @@ -0,0 +1,9 @@ +from colorama import Fore, Style +import subprocess +import sys + +def post_install(): + print(f"\n{Fore.YELLOW}{'='*40}") + print(f"{Fore.RED}IMPORTANT: Run this command now:") + print(f"{Fore.GREEN}python -m playwright install") + print(f"{Fore.YELLOW}{'='*40}{Style.RESET_ALL}\n") \ No newline at end of file diff --git a/post_install.py b/post_install.py new file mode 100644 index 00000000..e536e547 --- /dev/null +++ b/post_install.py @@ -0,0 +1,19 @@ +from colorama import Fore, Style +import subprocess +import sys +import 
distutils.log as log +from pathlib import Path + +def main(): + try: + subprocess.check_call([sys.executable, "-m", "playwright", "install"], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL) + except: + print(f"\n{Fore.YELLOW}{'='*40}") + print(f"{Fore.RED}IMPORTANT: Run this command now:") + print(f"{Fore.GREEN}python -m playwright install") + print(f"{Fore.YELLOW}{'='*40}{Style.RESET_ALL}\n") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..cfef8101 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,75 @@ +[build-system] +requires = ["hatchling", "hatch-fancy-pypi-readme>=22.5.0"] +build-backend = "hatchling.build" + +[project] +name = "Crawl4AI" +dynamic = ["version"] +description = "🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper" +readme = "README.md" +license = "Apache-2.0" +requires-python = ">=3.7" +authors = [ + { name = "Unclecode", email = "unclecode@kidocode.com" }, +] +classifiers = [ + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", +] +dependencies = [ + "aiosqlite~=0.20", + "html2text~=2024.2", + "lxml~=5.3", + "litellm>=1.53.1", + "numpy>=1.26.0,<3", + "pillow~=10.4", + "playwright>=1.49.0", + "python-dotenv~=1.0", + "requests~=2.26", + "beautifulsoup4~=4.12", + "tf-playwright-stealth>=1.1.0", + "xxhash~=3.4", + "rank-bm25~=0.2", + "aiofiles>=24.1.0", + "colorama~=0.4", + "snowballstemmer~=2.2", +] + +[project.optional-dependencies] +torch = ["torch", "nltk", "scikit-learn"] +transformer = ["transformers", "tokenizers"] +cosine = ["torch", "transformers", "nltk"] +sync = ["selenium"] +all = [ + "torch", + "nltk", + "scikit-learn", + 
"transformers", + "tokenizers", + "selenium", +] + +[project.urls] +Homepage = "https://github.com/unclecode/crawl4ai" +Documentation = "https://crawl4ai.com/mkdocs/" + +[project.scripts] +crawl4ai-download-models = "crawl4ai.model_loader:main" +crawl4ai-migrate = "crawl4ai.migrations:main" +crawl4ai-post-install = "crawl4ai.post_install:main" + +[tool.hatch.version] +path = "crawl4ai/__version__.py" + +[tool.hatch.build.hooks.custom] +dependencies = ["hatch-fancy-pypi-readme>=22.5.0"] +path = "build_hooks.py" + +[project.entry-points.hatch] +crawl4ai = "crawl4ai.plugin:post_install" diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index c0f6f183..00000000 --- a/requirements.txt +++ /dev/null @@ -1,16 +0,0 @@ -aiosqlite~=0.20 -html2text~=2024.2 -lxml~=5.3 -litellm>=1.53.1 -numpy>=1.26.0,<3 -pillow~=10.4 -playwright>=1.49.0 -python-dotenv~=1.0 -requests~=2.26 -beautifulsoup4~=4.12 -tf-playwright-stealth>=1.1.0 -xxhash~=3.4 -rank-bm25~=0.2 -aiofiles>=24.1.0 -colorama~=0.4 -snowballstemmer~=2.2 \ No newline at end of file diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 56490d6a..00000000 --- a/setup.cfg +++ /dev/null @@ -1,2 +0,0 @@ -[options] -include_package_data = True \ No newline at end of file diff --git a/setup.py b/setup.py deleted file mode 100644 index d44169bf..00000000 --- a/setup.py +++ /dev/null @@ -1,136 +0,0 @@ -from setuptools import setup, find_packages -from setuptools.command.install import install -import os -from pathlib import Path -import shutil -import subprocess -import sys -import asyncio - -# Create the .crawl4ai folder in the user's home directory if it doesn't exist -# If the folder already exists, remove the cache folder -base_dir = os.getenv("CRAWL4_AI_BASE_DIRECTORY") -crawl4ai_folder = Path(base_dir) if base_dir else Path.home() -crawl4ai_folder = crawl4ai_folder / ".crawl4ai" -cache_folder = crawl4ai_folder / "cache" -content_folders = [ - "html_content", - "cleaned_html", - 
"markdown_content", - "extracted_content", - "screenshots", -] - -# Clean up old cache if exists -if cache_folder.exists(): - shutil.rmtree(cache_folder) - -# Create new folder structure -crawl4ai_folder.mkdir(exist_ok=True) -cache_folder.mkdir(exist_ok=True) -for folder in content_folders: - (crawl4ai_folder / folder).mkdir(exist_ok=True) - -# Read requirements and version -__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) -with open(os.path.join(__location__, "requirements.txt")) as f: - requirements = f.read().splitlines() - -with open("crawl4ai/__version__.py") as f: - for line in f: - if line.startswith("__version__"): - version = line.split("=")[1].strip().strip('"') - break - -# Define requirements -default_requirements = requirements -torch_requirements = ["torch", "nltk", "scikit-learn"] -transformer_requirements = ["transformers", "tokenizers"] -cosine_similarity_requirements = ["torch", "transformers", "nltk"] -sync_requirements = ["selenium"] - - -def install_playwright(): - print("Installing Playwright browsers...") - try: - subprocess.check_call([sys.executable, "-m", "playwright", "install"]) - print("Playwright installation completed successfully.") - except subprocess.CalledProcessError as e: - print(f"Error during Playwright installation: {e}") - print( - "Please run 'python -m playwright install' manually after the installation." - ) - except Exception as e: - print(f"Unexpected error during Playwright installation: {e}") - print( - "Please run 'python -m playwright install' manually after the installation." - ) - - -def run_migration(): - """Initialize database during installation""" - try: - print("Starting database initialization...") - from crawl4ai.async_database import async_db_manager - - asyncio.run(async_db_manager.initialize()) - print("Database initialization completed successfully.") - except ImportError: - print("Warning: Database module not found. 
Will initialize on first use.") - except Exception as e: - print(f"Warning: Database initialization failed: {e}") - print("Database will be initialized on first use") - - -class PostInstallCommand(install): - def run(self): - install.run(self) - install_playwright() - # run_migration() - - -setup( - name="Crawl4AI", - version=version, - description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper", - long_description=open("README.md", encoding="utf-8").read(), - long_description_content_type="text/markdown", - url="https://github.com/unclecode/crawl4ai", - author="Unclecode", - author_email="unclecode@kidocode.com", - license="MIT", - packages=find_packages(), - install_requires=default_requirements - + ["playwright", "aiofiles"], # Added aiofiles - extras_require={ - "torch": torch_requirements, - "transformer": transformer_requirements, - "cosine": cosine_similarity_requirements, - "sync": sync_requirements, - "all": default_requirements - + torch_requirements - + transformer_requirements - + cosine_similarity_requirements - + sync_requirements, - }, - entry_points={ - "console_scripts": [ - "crawl4ai-download-models=crawl4ai.model_loader:main", - "crawl4ai-migrate=crawl4ai.migrations:main", # Added migration command - ], - }, - classifiers=[ - "Development Status :: 3 - Alpha", - "Intended Audience :: Developers", - "License :: OSI Approved :: Apache Software License", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.7", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - ], - python_requires=">=3.7", - cmdclass={ - "install": PostInstallCommand, - }, -) From 12e73d489846dc83c29347bf84646ad8daef6cfc Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 29 Nov 2024 16:01:19 +0800 Subject: [PATCH 38/70] refactor: remove legacy build hooks and setup files, migrate to setup.cfg and pyproject.toml --- MANIFEST.in | 1 + build_hooks.py | 48 
----------------- plugin.py | 9 ---- post_install.py | 19 ------- pyproject.toml | 75 -------------------------- requirements.txt | 16 ++++++ setup.cfg | 2 + setup.py | 136 +++++++++++++++++++++++++++++++++++++++++++++++ 8 files changed, 155 insertions(+), 151 deletions(-) create mode 100644 MANIFEST.in delete mode 100644 build_hooks.py delete mode 100644 plugin.py delete mode 100644 post_install.py delete mode 100644 pyproject.toml create mode 100644 requirements.txt create mode 100644 setup.cfg create mode 100644 setup.py diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 00000000..540b7204 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1 @@ +include requirements.txt \ No newline at end of file diff --git a/build_hooks.py b/build_hooks.py deleted file mode 100644 index e59b5910..00000000 --- a/build_hooks.py +++ /dev/null @@ -1,48 +0,0 @@ -import os -import shutil -from pathlib import Path -import subprocess -import sys -from hatchling.builders.hooks.plugin.interface import BuildHookInterface -PLUGIN = "CustomBuildHook" - -class CustomBuildHook(BuildHookInterface): - def initialize(self, version, build_data): - # Create the .crawl4ai folder structure - base_dir = os.getenv("CRAWL4_AI_BASE_DIRECTORY") - crawl4ai_folder = Path(base_dir) if base_dir else Path.home() - crawl4ai_folder = crawl4ai_folder / ".crawl4ai" - cache_folder = crawl4ai_folder / "cache" - content_folders = [ - "html_content", - "cleaned_html", - "markdown_content", - "extracted_content", - "screenshots", - ] - - # Clean up old cache if exists - if cache_folder.exists(): - shutil.rmtree(cache_folder) - - # Create new folder structure - crawl4ai_folder.mkdir(exist_ok=True) - cache_folder.mkdir(exist_ok=True) - for folder in content_folders: - (crawl4ai_folder / folder).mkdir(exist_ok=True) - - # Install Playwright browsers - try: - subprocess.check_call([sys.executable, "-m", "playwright", "install"]) - except Exception as e: - print(f"Warning: Playwright installation failed: {e}") - 
print("Please run 'python -m playwright install' manually after installation") - - # Initialize database - try: - from crawl4ai.async_database import async_db_manager - import asyncio - asyncio.run(async_db_manager.initialize()) - except Exception as e: - print(f"Warning: Database initialization failed: {e}") - print("Database will be initialized on first use") \ No newline at end of file diff --git a/plugin.py b/plugin.py deleted file mode 100644 index 1e1b11bf..00000000 --- a/plugin.py +++ /dev/null @@ -1,9 +0,0 @@ -from colorama import Fore, Style -import subprocess -import sys - -def post_install(): - print(f"\n{Fore.YELLOW}{'='*40}") - print(f"{Fore.RED}IMPORTANT: Run this command now:") - print(f"{Fore.GREEN}python -m playwright install") - print(f"{Fore.YELLOW}{'='*40}{Style.RESET_ALL}\n") \ No newline at end of file diff --git a/post_install.py b/post_install.py deleted file mode 100644 index e536e547..00000000 --- a/post_install.py +++ /dev/null @@ -1,19 +0,0 @@ -from colorama import Fore, Style -import subprocess -import sys -import distutils.log as log -from pathlib import Path - -def main(): - try: - subprocess.check_call([sys.executable, "-m", "playwright", "install"], - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL) - except: - print(f"\n{Fore.YELLOW}{'='*40}") - print(f"{Fore.RED}IMPORTANT: Run this command now:") - print(f"{Fore.GREEN}python -m playwright install") - print(f"{Fore.YELLOW}{'='*40}{Style.RESET_ALL}\n") - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml deleted file mode 100644 index cfef8101..00000000 --- a/pyproject.toml +++ /dev/null @@ -1,75 +0,0 @@ -[build-system] -requires = ["hatchling", "hatch-fancy-pypi-readme>=22.5.0"] -build-backend = "hatchling.build" - -[project] -name = "Crawl4AI" -dynamic = ["version"] -description = "🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper" -readme = "README.md" -license = "Apache-2.0" -requires-python = ">=3.7" 
-authors = [ - { name = "Unclecode", email = "unclecode@kidocode.com" }, -] -classifiers = [ - "Development Status :: 3 - Alpha", - "Intended Audience :: Developers", - "License :: OSI Approved :: Apache Software License", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.7", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", -] -dependencies = [ - "aiosqlite~=0.20", - "html2text~=2024.2", - "lxml~=5.3", - "litellm>=1.53.1", - "numpy>=1.26.0,<3", - "pillow~=10.4", - "playwright>=1.49.0", - "python-dotenv~=1.0", - "requests~=2.26", - "beautifulsoup4~=4.12", - "tf-playwright-stealth>=1.1.0", - "xxhash~=3.4", - "rank-bm25~=0.2", - "aiofiles>=24.1.0", - "colorama~=0.4", - "snowballstemmer~=2.2", -] - -[project.optional-dependencies] -torch = ["torch", "nltk", "scikit-learn"] -transformer = ["transformers", "tokenizers"] -cosine = ["torch", "transformers", "nltk"] -sync = ["selenium"] -all = [ - "torch", - "nltk", - "scikit-learn", - "transformers", - "tokenizers", - "selenium", -] - -[project.urls] -Homepage = "https://github.com/unclecode/crawl4ai" -Documentation = "https://crawl4ai.com/mkdocs/" - -[project.scripts] -crawl4ai-download-models = "crawl4ai.model_loader:main" -crawl4ai-migrate = "crawl4ai.migrations:main" -crawl4ai-post-install = "crawl4ai.post_install:main" - -[tool.hatch.version] -path = "crawl4ai/__version__.py" - -[tool.hatch.build.hooks.custom] -dependencies = ["hatch-fancy-pypi-readme>=22.5.0"] -path = "build_hooks.py" - -[project.entry-points.hatch] -crawl4ai = "crawl4ai.plugin:post_install" diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..c0f6f183 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,16 @@ +aiosqlite~=0.20 +html2text~=2024.2 +lxml~=5.3 +litellm>=1.53.1 +numpy>=1.26.0,<3 +pillow~=10.4 +playwright>=1.49.0 +python-dotenv~=1.0 +requests~=2.26 +beautifulsoup4~=4.12 +tf-playwright-stealth>=1.1.0 
+xxhash~=3.4 +rank-bm25~=0.2 +aiofiles>=24.1.0 +colorama~=0.4 +snowballstemmer~=2.2 \ No newline at end of file diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 00000000..56490d6a --- /dev/null +++ b/setup.cfg @@ -0,0 +1,2 @@ +[options] +include_package_data = True \ No newline at end of file diff --git a/setup.py b/setup.py new file mode 100644 index 00000000..d44169bf --- /dev/null +++ b/setup.py @@ -0,0 +1,136 @@ +from setuptools import setup, find_packages +from setuptools.command.install import install +import os +from pathlib import Path +import shutil +import subprocess +import sys +import asyncio + +# Create the .crawl4ai folder in the user's home directory if it doesn't exist +# If the folder already exists, remove the cache folder +base_dir = os.getenv("CRAWL4_AI_BASE_DIRECTORY") +crawl4ai_folder = Path(base_dir) if base_dir else Path.home() +crawl4ai_folder = crawl4ai_folder / ".crawl4ai" +cache_folder = crawl4ai_folder / "cache" +content_folders = [ + "html_content", + "cleaned_html", + "markdown_content", + "extracted_content", + "screenshots", +] + +# Clean up old cache if exists +if cache_folder.exists(): + shutil.rmtree(cache_folder) + +# Create new folder structure +crawl4ai_folder.mkdir(exist_ok=True) +cache_folder.mkdir(exist_ok=True) +for folder in content_folders: + (crawl4ai_folder / folder).mkdir(exist_ok=True) + +# Read requirements and version +__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) +with open(os.path.join(__location__, "requirements.txt")) as f: + requirements = f.read().splitlines() + +with open("crawl4ai/__version__.py") as f: + for line in f: + if line.startswith("__version__"): + version = line.split("=")[1].strip().strip('"') + break + +# Define requirements +default_requirements = requirements +torch_requirements = ["torch", "nltk", "scikit-learn"] +transformer_requirements = ["transformers", "tokenizers"] +cosine_similarity_requirements = ["torch", "transformers", "nltk"] 
+sync_requirements = ["selenium"] + + +def install_playwright(): + print("Installing Playwright browsers...") + try: + subprocess.check_call([sys.executable, "-m", "playwright", "install"]) + print("Playwright installation completed successfully.") + except subprocess.CalledProcessError as e: + print(f"Error during Playwright installation: {e}") + print( + "Please run 'python -m playwright install' manually after the installation." + ) + except Exception as e: + print(f"Unexpected error during Playwright installation: {e}") + print( + "Please run 'python -m playwright install' manually after the installation." + ) + + +def run_migration(): + """Initialize database during installation""" + try: + print("Starting database initialization...") + from crawl4ai.async_database import async_db_manager + + asyncio.run(async_db_manager.initialize()) + print("Database initialization completed successfully.") + except ImportError: + print("Warning: Database module not found. Will initialize on first use.") + except Exception as e: + print(f"Warning: Database initialization failed: {e}") + print("Database will be initialized on first use") + + +class PostInstallCommand(install): + def run(self): + install.run(self) + install_playwright() + # run_migration() + + +setup( + name="Crawl4AI", + version=version, + description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper", + long_description=open("README.md", encoding="utf-8").read(), + long_description_content_type="text/markdown", + url="https://github.com/unclecode/crawl4ai", + author="Unclecode", + author_email="unclecode@kidocode.com", + license="MIT", + packages=find_packages(), + install_requires=default_requirements + + ["playwright", "aiofiles"], # Added aiofiles + extras_require={ + "torch": torch_requirements, + "transformer": transformer_requirements, + "cosine": cosine_similarity_requirements, + "sync": sync_requirements, + "all": default_requirements + + torch_requirements + + transformer_requirements + 
+ cosine_similarity_requirements + + sync_requirements, + }, + entry_points={ + "console_scripts": [ + "crawl4ai-download-models=crawl4ai.model_loader:main", + "crawl4ai-migrate=crawl4ai.migrations:main", # Added migration command + ], + }, + classifiers=[ + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + ], + python_requires=">=3.7", + cmdclass={ + "install": PostInstallCommand, + }, +) From d202f3539bf7447f7594f7f1897c3062c337ae52 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 29 Nov 2024 18:48:44 +0800 Subject: [PATCH 39/70] Enhance installation and migration processes - Added a post-installation setup script for initialization. - Updated README with installation notes for Playwright setup. - Enhanced migration logging for better error visibility. - Added 'pydantic' to requirements. - Bumped version to 0.3.746. --- README.md | 32 ++----------------- crawl4ai/__init__.py | 1 - crawl4ai/__version__.py | 2 +- crawl4ai/install.py | 44 ++++++++++++++++++++++++++ crawl4ai/migrations.py | 40 ++++++++++++++++-------- docs/examples/quickstart_async.py | 18 +++++------ requirements.txt | 4 +-- setup.py | 51 ++----------------------------- 8 files changed, 90 insertions(+), 102 deletions(-) create mode 100644 crawl4ai/install.py diff --git a/README.md b/README.md index e8e6cddf..bbfa5858 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,7 @@ Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant 1. Install Crawl4AI: ```bash pip install crawl4ai +crawl4ai-setup # Setup the browser ``` 2. 
Run a simple web crawl: @@ -125,34 +126,6 @@ if __name__ == "__main__": ✨ Visit our [Documentation Website](https://crawl4ai.com/mkdocs/) -## Features ✨ - -- 🆓 Completely free and open-source -- 🚀 Blazing fast performance, outperforming many paid services -- 🤖 LLM-friendly output formats (JSON, cleaned HTML, markdown) -- 🌐 Multi-browser support (Chromium, Firefox, WebKit) -- 🌍 Supports crawling multiple URLs simultaneously -- 🎨 Extracts and returns all media tags (Images, Audio, and Video) -- 🔗 Extracts all external and internal links -- 📚 Extracts metadata from the page -- 🔄 Custom hooks for authentication, headers, and page modifications -- 🕵️ User-agent customization -- 🖼️ Takes screenshots of pages with enhanced error handling -- 📜 Executes multiple custom JavaScripts before crawling -- 📊 Generates structured output without LLM using JsonCssExtractionStrategy -- 📚 Various chunking strategies: topic-based, regex, sentence, and more -- 🧠 Advanced extraction strategies: cosine clustering, LLM, and more -- 🎯 CSS selector support for precise data extraction -- 📝 Passes instructions/keywords to refine extraction -- 🔒 Proxy support with authentication for enhanced access -- 🔄 Session management for complex multi-page crawling -- 🌐 Asynchronous architecture for improved performance -- 🖼️ Improved image processing with lazy-loading detection -- 🕰️ Enhanced handling of delayed content loading -- 🔑 Custom headers support for LLM interactions -- 🖼️ iframe content extraction for comprehensive analysis -- ⏱️ Flexible timeout and delayed content retrieval options - ## Installation 🛠️ Crawl4AI offers flexible installation options to suit various use cases. You can install it as a Python package or use Docker. @@ -168,11 +141,12 @@ For basic web crawling and scraping tasks: ```bash pip install crawl4ai +crawl4ai-setup # Setup the browser ``` By default, this will install the asynchronous version of Crawl4AI, using Playwright for web crawling. 
-👉 **Note**: When you install Crawl4AI, the setup script should automatically install and set up Playwright. However, if you encounter any Playwright-related errors, you can manually install it using one of these methods: +👉 **Note**: When you install Crawl4AI, the `crawl4ai-setup` should automatically install and set up Playwright. However, if you encounter any Playwright-related errors, you can manually install it using one of these methods: 1. Through the command line: diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py index 0ccf13d8..cee7c25b 100644 --- a/crawl4ai/__init__.py +++ b/crawl4ai/__init__.py @@ -4,7 +4,6 @@ from .async_webcrawler import AsyncWebCrawler, CacheMode from .models import CrawlResult from .__version__ import __version__ -# __version__ = "0.3.73" __all__ = [ "AsyncWebCrawler", diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index 8b69d491..4a938b75 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,2 +1,2 @@ # crawl4ai/_version.py -__version__ = "0.3.745" +__version__ = "0.3.746" diff --git a/crawl4ai/install.py b/crawl4ai/install.py new file mode 100644 index 00000000..71fe30ea --- /dev/null +++ b/crawl4ai/install.py @@ -0,0 +1,44 @@ +import subprocess +import sys +import asyncio +from .async_logger import AsyncLogger, LogLevel + +# Initialize logger +logger = AsyncLogger(log_level=LogLevel.DEBUG, verbose=True) + +def post_install(): + """Run all post-installation tasks""" + logger.info("Running post-installation setup...", tag="INIT") + install_playwright() + run_migration() + logger.success("Post-installation setup completed!", tag="COMPLETE") + +def install_playwright(): + logger.info("Installing Playwright browsers...", tag="INIT") + try: + subprocess.check_call([sys.executable, "-m", "playwright", "install"]) + logger.success("Playwright installation completed successfully.", tag="COMPLETE") + except subprocess.CalledProcessError as e: + logger.error(f"Error during Playwright installation: 
{e}", tag="ERROR") + logger.warning( + "Please run 'python -m playwright install' manually after the installation." + ) + except Exception as e: + logger.error(f"Unexpected error during Playwright installation: {e}", tag="ERROR") + logger.warning( + "Please run 'python -m playwright install' manually after the installation." + ) + +def run_migration(): + """Initialize database during installation""" + try: + logger.info("Starting database initialization...", tag="INIT") + from crawl4ai.async_database import async_db_manager + + asyncio.run(async_db_manager.initialize()) + logger.success("Database initialization completed successfully.", tag="COMPLETE") + except ImportError: + logger.warning("Database module not found. Will initialize on first use.") + except Exception as e: + logger.warning(f"Database initialization failed: {e}") + logger.warning("Database will be initialized on first use") \ No newline at end of file diff --git a/crawl4ai/migrations.py b/crawl4ai/migrations.py index 77616086..3386b0fb 100644 --- a/crawl4ai/migrations.py +++ b/crawl4ai/migrations.py @@ -9,9 +9,13 @@ import aiofiles import shutil import time from datetime import datetime +from .async_logger import AsyncLogger, LogLevel -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) +# Initialize logger +logger = AsyncLogger(log_level=LogLevel.DEBUG, verbose=True) + +# logging.basicConfig(level=logging.INFO) +# logger = logging.getLogger(__name__) class DatabaseMigration: def __init__(self, db_path: str): @@ -55,7 +59,8 @@ class DatabaseMigration: async def migrate_database(self): """Migrate existing database to file-based storage""" - logger.info("Starting database migration...") + # logger.info("Starting database migration...") + logger.info("Starting database migration...", tag="INIT") try: async with aiosqlite.connect(self.db_path) as db: @@ -91,19 +96,25 @@ class DatabaseMigration: migrated_count += 1 if migrated_count % 100 == 0: - logger.info(f"Migrated 
{migrated_count} records...") + logger.info(f"Migrated {migrated_count} records...", tag="INIT") + await db.commit() - logger.info(f"Migration completed. {migrated_count} records processed.") + logger.success(f"Migration completed. {migrated_count} records processed.", tag="COMPLETE") except Exception as e: - logger.error(f"Migration failed: {e}") - raise + # logger.error(f"Migration failed: {e}") + logger.error( + message="Migration failed: {error}", + tag="ERROR", + params={"error": str(e)} + ) + raise e async def backup_database(db_path: str) -> str: """Create backup of existing database""" if not os.path.exists(db_path): - logger.info("No existing database found. Skipping backup.") + logger.info("No existing database found. Skipping backup.", tag="INIT") return None # Create backup with timestamp @@ -116,11 +127,16 @@ async def backup_database(db_path: str) -> str: # Create backup shutil.copy2(db_path, backup_path) - logger.info(f"Database backup created at: {backup_path}") + logger.info(f"Database backup created at: {backup_path}", tag="COMPLETE") return backup_path except Exception as e: - logger.error(f"Backup failed: {e}") - raise + # logger.error(f"Backup failed: {e}") + logger.error( + message="Migration failed: {error}", + tag="ERROR", + params={"error": str(e)} + ) + raise e async def run_migration(db_path: Optional[str] = None): """Run database migration""" @@ -128,7 +144,7 @@ async def run_migration(db_path: Optional[str] = None): db_path = os.path.join(Path.home(), ".crawl4ai", "crawl4ai.db") if not os.path.exists(db_path): - logger.info("No existing database found. Skipping migration.") + logger.info("No existing database found. 
Skipping migration.", tag="INIT") return # Create backup first diff --git a/docs/examples/quickstart_async.py b/docs/examples/quickstart_async.py index 01f7677c..679a9bc2 100644 --- a/docs/examples/quickstart_async.py +++ b/docs/examples/quickstart_async.py @@ -562,18 +562,18 @@ async def fit_markdown_remove_overlay(): async def main(): - # await simple_crawl() - # await simple_example_with_running_js_code() - # await simple_example_with_css_selector() + await simple_crawl() + await simple_example_with_running_js_code() + await simple_example_with_css_selector() # await use_proxy() - # await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg")) - # await extract_structured_data_using_css_extractor() + await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg")) + await extract_structured_data_using_css_extractor() # LLM extraction examples # await extract_structured_data_using_llm() # await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY")) # await extract_structured_data_using_llm("ollama/llama3.2") - # await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY")) + await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY")) # You always can pass custom headers to the extraction strategy # custom_headers = { @@ -582,9 +582,9 @@ async def main(): # } # await extract_structured_data_using_llm(extra_headers=custom_headers) - # await crawl_dynamic_content_pages_method_1() - # await crawl_dynamic_content_pages_method_2() - # await crawl_dynamic_content_pages_method_3() + await crawl_dynamic_content_pages_method_1() + await crawl_dynamic_content_pages_method_2() + await crawl_dynamic_content_pages_method_3() await crawl_custom_browser_type() diff --git a/requirements.txt b/requirements.txt index c0f6f183..741e12ef 100644 --- 
a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,4 @@ aiosqlite~=0.20 -html2text~=2024.2 lxml~=5.3 litellm>=1.53.1 numpy>=1.26.0,<3 @@ -13,4 +12,5 @@ xxhash~=3.4 rank-bm25~=0.2 aiofiles>=24.1.0 colorama~=0.4 -snowballstemmer~=2.2 \ No newline at end of file +snowballstemmer~=2.2 +pydantic>=2.10 \ No newline at end of file diff --git a/setup.py b/setup.py index d44169bf..e6840cd0 100644 --- a/setup.py +++ b/setup.py @@ -1,11 +1,8 @@ from setuptools import setup, find_packages -from setuptools.command.install import install import os from pathlib import Path import shutil -import subprocess -import sys -import asyncio + # Create the .crawl4ai folder in the user's home directory if it doesn't exist # If the folder already exists, remove the cache folder @@ -49,46 +46,6 @@ transformer_requirements = ["transformers", "tokenizers"] cosine_similarity_requirements = ["torch", "transformers", "nltk"] sync_requirements = ["selenium"] - -def install_playwright(): - print("Installing Playwright browsers...") - try: - subprocess.check_call([sys.executable, "-m", "playwright", "install"]) - print("Playwright installation completed successfully.") - except subprocess.CalledProcessError as e: - print(f"Error during Playwright installation: {e}") - print( - "Please run 'python -m playwright install' manually after the installation." - ) - except Exception as e: - print(f"Unexpected error during Playwright installation: {e}") - print( - "Please run 'python -m playwright install' manually after the installation." - ) - - -def run_migration(): - """Initialize database during installation""" - try: - print("Starting database initialization...") - from crawl4ai.async_database import async_db_manager - - asyncio.run(async_db_manager.initialize()) - print("Database initialization completed successfully.") - except ImportError: - print("Warning: Database module not found. 
Will initialize on first use.") - except Exception as e: - print(f"Warning: Database initialization failed: {e}") - print("Database will be initialized on first use") - - -class PostInstallCommand(install): - def run(self): - install.run(self) - install_playwright() - # run_migration() - - setup( name="Crawl4AI", version=version, @@ -116,7 +73,8 @@ setup( entry_points={ "console_scripts": [ "crawl4ai-download-models=crawl4ai.model_loader:main", - "crawl4ai-migrate=crawl4ai.migrations:main", # Added migration command + "crawl4ai-migrate=crawl4ai.migrations:main", + 'crawl4ai-setup=crawl4ai.install:post_install', ], }, classifiers=[ @@ -130,7 +88,4 @@ setup( "Programming Language :: Python :: 3.10", ], python_requires=">=3.7", - cmdclass={ - "install": PostInstallCommand, - }, ) From 93bf3e8a1f87760e04d6a18b2e27bae0f5d5da0e Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 29 Nov 2024 20:08:09 +0800 Subject: [PATCH 40/70] Refactor Dockerfile and clean up main.py - Enhanced Dockerfile for platform-specific installations - Added ARG for TARGETPLATFORM and BUILDPLATFORM - Improved GPU support conditional on TARGETPLATFORM - Removed static pages mounting in main.py - Streamlined code structure to improve maintainability --- Dockerfile | 25 ++++++++++++++++--------- main.py | 4 ---- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/Dockerfile b/Dockerfile index bd71deae..2997590a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,9 @@ # syntax=docker/dockerfile:1.4 -# Build arguments +ARG TARGETPLATFORM +ARG BUILDPLATFORM + +# Other build arguments ARG PYTHON_VERSION=3.10 # Base stage with system dependencies @@ -63,13 +66,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ && rm -rf /var/lib/apt/lists/* # GPU support if enabled and architecture is supported -RUN if [ "$ENABLE_GPU" = "true" ] && [ "$(dpkg --print-architecture)" != "arm64" ] ; then \ - apt-get update && apt-get install -y --no-install-recommends \ - nvidia-cuda-toolkit 
\ - && rm -rf /var/lib/apt/lists/* ; \ - else \ - echo "Skipping NVIDIA CUDA Toolkit installation (unsupported architecture or GPU disabled)"; \ - fi +RUN if [ "$ENABLE_GPU" = "true" ] && [ "$TARGETPLATFORM" = "linux/amd64" ] ; then \ + apt-get update && apt-get install -y --no-install-recommends \ + nvidia-cuda-toolkit \ + && rm -rf /var/lib/apt/lists/* ; \ +else \ + echo "Skipping NVIDIA CUDA Toolkit installation (unsupported platform or GPU disabled)"; \ +fi # Create and set working directory WORKDIR /app @@ -120,7 +123,11 @@ RUN pip install --no-cache-dir \ RUN mkdocs build # Install Playwright and browsers -RUN playwright install +RUN if [ "$TARGETPLATFORM" = "linux/amd64" ]; then \ + playwright install chromium; \ + elif [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ + playwright install chromium; \ + fi # Expose port EXPOSE 8000 11235 9222 8080 diff --git a/main.py b/main.py index 6d217410..d6c792e8 100644 --- a/main.py +++ b/main.py @@ -340,9 +340,6 @@ app.add_middleware( allow_headers=["*"], # Allows all headers ) -# Mount the pages directory as a static directory -app.mount("/pages", StaticFiles(directory=__location__ + "/pages"), name="pages") - # API token security security = HTTPBearer() CRAWL4AI_API_TOKEN = os.getenv("CRAWL4AI_API_TOKEN") or "test_api_code" @@ -364,7 +361,6 @@ if os.path.exists(__location__ + "/site"): app.mount("/mkdocs", StaticFiles(directory="site", html=True), name="mkdocs") site_templates = Jinja2Templates(directory=__location__ + "/site") -templates = Jinja2Templates(directory=__location__ + "/pages") crawler_service = CrawlerService() From f9c98a377dd1dda28f88cd5ab4e801535a88abcc Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 29 Nov 2024 20:52:51 +0800 Subject: [PATCH 41/70] Enhance Docker support and improve installation process - Added new Docker commands for platform-specific builds. - Updated README with comprehensive installation and setup instructions. - Introduced `post_install` method in setup script for automation. 
- Refined migration processes with enhanced error logging. - Bump version to 0.3.746 and updated dependencies. --- CHANGELOG.md | 59 +++++++++++ README.md | 177 +++++++++++++++++++++++++++----- docker-compose.yml | 65 ++++++------ docs/examples/docker_example.py | 22 ++-- 4 files changed, 256 insertions(+), 67 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5ec79639..309218dc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,64 @@ # Changelog +## [0.3.746] November 29, 2024 + +### Major Features +1. Enhanced Docker Support (Nov 29, 2024) + - Improved GPU support in Docker images. + - Dockerfile refactored for better platform-specific installations. + - Introduced new Docker commands for different platforms: + - `basic-amd64`, `all-amd64`, `gpu-amd64` for AMD64. + - `basic-arm64`, `all-arm64`, `gpu-arm64` for ARM64. + +### Infrastructure & Documentation +- Enhanced README.md to improve user guidance and installation instructions. +- Added installation instructions for Playwright setup in README. +- Created and updated examples in `docs/examples/quickstart_async.py` to be more useful and user-friendly. +- Updated `requirements.txt` with a new `pydantic` dependency. +- Bumped version number in `crawl4ai/__version__.py` to 0.3.746. + +### Breaking Changes +- Streamlined application structure: + - Removed static pages and related code from `main.py` which might affect existing deployments relying on static content. + +### Development Updates +- Developed `post_install` method in `crawl4ai/install.py` to streamline post-installation setup tasks. +- Refined migration processes in `crawl4ai/migrations.py` with enhanced logging for better error visibility. +- Updated `docker-compose.yml` to support local and hub services for different architectures, enhancing build and deploy capabilities. +- Refactored example test cases in `docs/examples/docker_example.py` to facilitate comprehensive testing. 
+ +### README.md +Updated README with new docker commands and setup instructions. +Enhanced installation instructions and guidance. + +### crawl4ai/install.py +Added post-install script functionality. +Introduced `post_install` method for automation of post-installation tasks. + +### crawl4ai/migrations.py +Improved migration logging. +Refined migration processes and added better logging. + +### docker-compose.yml +Refactored docker-compose for better service management. +Updated to define services for different platforms and versions. + +### requirements.txt +Updated dependencies. +Added `pydantic` to requirements file. + +### crawler/__version__.py +Updated version number. +Bumped version number to 0.3.746. + +### docs/examples/quickstart_async.py +Enhanced example scripts. +Uncommented example usage in async guide for user functionality. + +### main.py +Refactored code to improve maintainability. +Streamlined app structure by removing static pages code. + ## [0.3.743] November 27, 2024 Enhance features and documentation diff --git a/README.md b/README.md index bbfa5858..3d89ee19 100644 --- a/README.md +++ b/README.md @@ -220,48 +220,173 @@ Crawl4AI is available as Docker images for easy deployment. You can either pull --- -### Option 1: Docker Hub (Recommended) +
+🐳 Option 1: Docker Hub (Recommended) +Choose the appropriate image based on your platform and needs: + +### For AMD64 (Regular Linux/Windows): ```bash -# Pull and run from Docker Hub (choose one): -docker pull unclecode/crawl4ai:basic # Basic crawling features -docker pull unclecode/crawl4ai:all # Full installation (ML, LLM support) -docker pull unclecode/crawl4ai:gpu # GPU-enabled version +# Basic version (recommended) +docker pull unclecode/crawl4ai:basic-amd64 +docker run -p 11235:11235 unclecode/crawl4ai:basic-amd64 -# Run the container -docker run -p 11235:11235 unclecode/crawl4ai:basic # Replace 'basic' with your chosen version +# Full ML/LLM support +docker pull unclecode/crawl4ai:all-amd64 +docker run -p 11235:11235 unclecode/crawl4ai:all-amd64 -# In case you want to set platform to arm64 -docker run --platform linux/arm64 -p 11235:11235 unclecode/crawl4ai:basic - -# In case to allocate more shared memory for the container -docker run --shm-size=2gb -p 11235:11235 unclecode/crawl4ai:basic +# With GPU support +docker pull unclecode/crawl4ai:gpu-amd64 +docker run -p 11235:11235 unclecode/crawl4ai:gpu-amd64 ``` ---- +### For ARM64 (M1/M2 Macs, ARM servers): +```bash +# Basic version (recommended) +docker pull unclecode/crawl4ai:basic-arm64 +docker run -p 11235:11235 unclecode/crawl4ai:basic-arm64 -### Option 2: Build from Repository +# Full ML/LLM support +docker pull unclecode/crawl4ai:all-arm64 +docker run -p 11235:11235 unclecode/crawl4ai:all-arm64 + +# With GPU support +docker pull unclecode/crawl4ai:gpu-arm64 +docker run -p 11235:11235 unclecode/crawl4ai:gpu-arm64 +``` + +Need more memory? 
Add `--shm-size`: +```bash +docker run --shm-size=2gb -p 11235:11235 unclecode/crawl4ai:basic-amd64 +``` + +Test the installation: +```bash +curl http://localhost:11235/health +``` + +### For Raspberry Pi (32-bit) (Experimental) +```bash +# Pull and run basic version (recommended for Raspberry Pi) +docker pull unclecode/crawl4ai:basic-armv7 +docker run -p 11235:11235 unclecode/crawl4ai:basic-armv7 + +# With increased shared memory if needed +docker run --shm-size=2gb -p 11235:11235 unclecode/crawl4ai:basic-armv7 +``` + +Note: Due to hardware constraints, only the basic version is recommended for Raspberry Pi. + +
+ +
+🐳 Option 2: Build from Repository + +Build the image locally based on your platform: ```bash # Clone the repository git clone https://github.com/unclecode/crawl4ai.git cd crawl4ai -# Build the image -docker build -t crawl4ai:local \ - --build-arg INSTALL_TYPE=basic \ # Options: basic, all +# For AMD64 (Regular Linux/Windows) +docker build --platform linux/amd64 \ + --tag crawl4ai:local \ + --build-arg INSTALL_TYPE=basic \ . -# In case you want to set platform to arm64 -docker build -t crawl4ai:local \ - --build-arg INSTALL_TYPE=basic \ # Options: basic, all - --platform linux/arm64 \ +# For ARM64 (M1/M2 Macs, ARM servers) +docker build --platform linux/arm64 \ + --tag crawl4ai:local \ + --build-arg INSTALL_TYPE=basic \ . - -# Run your local build -docker run -p 11235:11235 crawl4ai:local ``` +Build options: +- INSTALL_TYPE=basic (default): Basic crawling features +- INSTALL_TYPE=all: Full ML/LLM support +- ENABLE_GPU=true: Add GPU support + +Example with all options: +```bash +docker build --platform linux/amd64 \ + --tag crawl4ai:local \ + --build-arg INSTALL_TYPE=all \ + --build-arg ENABLE_GPU=true \ + . +``` + +Run your local build: +```bash +# Regular run +docker run -p 11235:11235 crawl4ai:local + +# With increased shared memory +docker run --shm-size=2gb -p 11235:11235 crawl4ai:local +``` + +Test the installation: +```bash +curl http://localhost:11235/health +``` + +
+ +
+🐳 Option 3: Using Docker Compose + +Docker Compose provides a more structured way to run Crawl4AI, especially when dealing with environment variables and multiple configurations. + +```bash +# Clone the repository +git clone https://github.com/unclecode/crawl4ai.git +cd crawl4ai +``` + +### For AMD64 (Regular Linux/Windows): +```bash +# Build and run locally +docker-compose --profile local-amd64 up + +# Run from Docker Hub +VERSION=basic docker-compose --profile hub-amd64 up # Basic version +VERSION=all docker-compose --profile hub-amd64 up # Full ML/LLM support +VERSION=gpu docker-compose --profile hub-amd64 up # GPU support +``` + +### For ARM64 (M1/M2 Macs, ARM servers): +```bash +# Build and run locally +docker-compose --profile local-arm64 up + +# Run from Docker Hub +VERSION=basic docker-compose --profile hub-arm64 up # Basic version +VERSION=all docker-compose --profile hub-arm64 up # Full ML/LLM support +VERSION=gpu docker-compose --profile hub-arm64 up # GPU support +``` + +Environment variables (optional): +```bash +# Create a .env file +CRAWL4AI_API_TOKEN=your_token +OPENAI_API_KEY=your_openai_key +CLAUDE_API_KEY=your_claude_key +``` + +The compose file includes: +- Memory management (4GB limit, 1GB reserved) +- Shared memory volume for browser support +- Health checks +- Auto-restart policy +- All necessary port mappings + +Test the installation: +```bash +curl http://localhost:11235/health +``` + +
+ --- ### Quick Test @@ -278,11 +403,11 @@ response = requests.post( ) task_id = response.json()["task_id"] -# Get results +# Continue polling until the task is complete (status="completed") result = requests.get(f"http://localhost:11235/task/{task_id}") ``` -For advanced configuration, environment variables, and usage examples, see our [Docker Deployment Guide](https://crawl4ai.com/mkdocs/basic/docker-deployment/). +For more examples, see our [Docker Examples](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/docker_example.py). For advanced configuration, environment variables, and usage examples, see our [Docker Deployment Guide](https://crawl4ai.com/mkdocs/basic/docker-deployment/).
diff --git a/docker-compose.yml b/docker-compose.yml index b93beda9..4b22fd98 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,5 +1,6 @@ services: - crawl4ai: + # Local build services for different platforms + crawl4ai-amd64: build: context: . dockerfile: Dockerfile @@ -7,35 +8,39 @@ services: PYTHON_VERSION: "3.10" INSTALL_TYPE: ${INSTALL_TYPE:-basic} ENABLE_GPU: false - profiles: ["local"] - ports: - - "11235:11235" - - "8000:8000" - - "9222:9222" - - "8080:8080" - environment: - - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} - - OPENAI_API_KEY=${OPENAI_API_KEY:-} - - CLAUDE_API_KEY=${CLAUDE_API_KEY:-} - volumes: - - /dev/shm:/dev/shm - deploy: - resources: - limits: - memory: 4G - reservations: - memory: 1G - restart: unless-stopped - healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:11235/health"] - interval: 30s - timeout: 10s - retries: 3 - start_period: 40s + platforms: + - linux/amd64 + profiles: ["local-amd64"] + extends: &base-config + file: docker-compose.yml + service: base-config - crawl4ai-hub: - image: unclecode/crawl4ai:basic - profiles: ["hub"] + crawl4ai-arm64: + build: + context: . 
+ dockerfile: Dockerfile + args: + PYTHON_VERSION: "3.10" + INSTALL_TYPE: ${INSTALL_TYPE:-basic} + ENABLE_GPU: false + platforms: + - linux/arm64 + profiles: ["local-arm64"] + extends: *base-config + + # Hub services for different platforms and versions + crawl4ai-hub-amd64: + image: unclecode/crawl4ai:${VERSION:-basic}-amd64 + profiles: ["hub-amd64"] + extends: *base-config + + crawl4ai-hub-arm64: + image: unclecode/crawl4ai:${VERSION:-basic}-arm64 + profiles: ["hub-arm64"] + extends: *base-config + + # Base configuration to be extended + base-config: ports: - "11235:11235" - "8000:8000" @@ -59,4 +64,4 @@ services: interval: 30s timeout: 10s retries: 3 - start_period: 40s + start_period: 40s \ No newline at end of file diff --git a/docs/examples/docker_example.py b/docs/examples/docker_example.py index 17ef9f04..48acc809 100644 --- a/docs/examples/docker_example.py +++ b/docs/examples/docker_example.py @@ -78,20 +78,20 @@ def test_docker_deployment(version="basic"): time.sleep(5) # Test cases based on version - # test_basic_crawl(tester) - # test_basic_crawl(tester) - # test_basic_crawl_sync(tester) test_basic_crawl_direct(tester) + test_basic_crawl(tester) + test_basic_crawl(tester) + test_basic_crawl_sync(tester) - # if version in ["full", "transformer"]: - # test_cosine_extraction(tester) + if version in ["full", "transformer"]: + test_cosine_extraction(tester) - # test_js_execution(tester) - # test_css_selector(tester) - # test_structured_extraction(tester) - # test_llm_extraction(tester) - # test_llm_with_ollama(tester) - # test_screenshot(tester) + test_js_execution(tester) + test_css_selector(tester) + test_structured_extraction(tester) + test_llm_extraction(tester) + test_llm_with_ollama(tester) + test_screenshot(tester) def test_basic_crawl(tester: Crawl4AiTester): From 1def53b7fe60267d5bc1f492f50b5f53f8858eee Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 29 Nov 2024 20:53:43 +0800 Subject: [PATCH 42/70] docs: update Raspberry Pi section to indicate 
upcoming support --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3d89ee19..405c1002 100644 --- a/README.md +++ b/README.md @@ -265,7 +265,7 @@ Test the installation: curl http://localhost:11235/health ``` -### For Raspberry Pi (32-bit) (Experimental) +### For Raspberry Pi (32-bit) (coming soon): ```bash # Pull and run basic version (recommended for Raspberry Pi) docker pull unclecode/crawl4ai:basic-armv7 From 1ed7c15118fc81427fa29afe6368eb2a47720fd4 Mon Sep 17 00:00:00 2001 From: dvschuyl <125589423+dvschuyl@users.noreply.github.com> Date: Fri, 29 Nov 2024 14:06:04 +0100 Subject: [PATCH 43/70] :adhesive_bandage: Page-evaluate navigation destroyed error (#304) Thanks for your contribution and such a nice approach. Now that I think of it, I guess I can make good use of this for some other part of the code. By the way, thank you so much; I will add your name to the new list of contributors. --- crawl4ai/async_crawler_strategy.py | 1 + 1 file changed, 1 insertion(+) diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index e5316187..a41d29a8 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -920,6 +920,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): }); } """ + await page.wait_for_load_state() await page.evaluate(update_image_dimensions_js) # Wait a bit for any onload events to complete From 0780db55e1298e73178077ec0bdc65cd534faa8d Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 29 Nov 2024 21:12:19 +0800 Subject: [PATCH 44/70] fix: handle errors during image dimension updates in AsyncPlaywrightCrawlerStrategy --- crawl4ai/async_crawler_strategy.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index e5316187..cc7f3993 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -920,7 
+920,11 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): }); } """ - await page.evaluate(update_image_dimensions_js) + try: + await page.wait_for_load_state() + await page.evaluate(update_image_dimensions_js) + except Exception as e: + raise RuntimeError(f"Error updating image dimensions ACS-UPDATE_IMAGE_DIMENSIONS_JS: {str(e)}") # Wait a bit for any onload events to complete await page.wait_for_timeout(100) From 8c76a8c7dcb2820a351eeb5696db2fc04fce7805 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 29 Nov 2024 21:14:49 +0800 Subject: [PATCH 45/70] docs: add contributor entry for dvschuyl regarding AsyncPlaywrightCrawlerStrategy issue --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 663e5541..79038bdd 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -18,6 +18,7 @@ We would like to thank the following people for their contributions to Crawl4AI: ## Pull Requests +- [dvschuyl](https://github.com/dvschuyl) - AsyncPlaywrightCrawlerStrategy page-evaluate context destroyed by navigation [#304](https://github.com/unclecode/crawl4ai/pull/304) - [nelzomal](https://github.com/nelzomal) - Enhance development installation instructions [#286](https://github.com/unclecode/crawl4ai/pull/286) - [HamzaFarhan](https://github.com/HamzaFarhan) - Handled the cases where markdown_with_citations, references_markdown, and filtered_html might not be defined [#293](https://github.com/unclecode/crawl4ai/pull/293) - [NanmiCoder](https://github.com/NanmiCoder) - fix: crawler strategy exception handling and fixes [#271](https://github.com/unclecode/crawl4ai/pull/271) From 3e83893b3f41b7176f6ec0beaccab9f2b159785d Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sat, 30 Nov 2024 18:13:12 +0800 Subject: [PATCH 46/70] Enhance User-Agent Handling - Added a new UserAgentGenerator class for generating random User-Agents. - Integrated User-Agent generation in AsyncPlaywrightCrawlerStrategy for randomization. 
- Enhanced HTTP headers with generated Client Hints. --- crawl4ai/async_crawler_strategy.py | 33 +++- crawl4ai/user_agent_generator.py | 262 +++++++++++++++++++++++++++++ 2 files changed, 289 insertions(+), 6 deletions(-) create mode 100644 crawl4ai/user_agent_generator.py diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index cc7f3993..3d24bd84 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -16,6 +16,7 @@ import json import uuid from .models import AsyncCrawlResponse from .utils import create_box_message +from .user_agent_generator import UserAgentGenerator from playwright_stealth import StealthConfig, stealth_async stealth_config = StealthConfig( @@ -222,14 +223,20 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): self.use_cached_html = use_cached_html self.user_agent = kwargs.get( "user_agent", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " - "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" + "Mozilla/5.0 (Linux; Android 11; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.120 Mobile Safari/537.36" ) + user_agenr_generator = UserAgentGenerator() + if kwargs.get("user_agent_mode") == "random": + self.user_agent = user_agenr_generator.generate( + **kwargs.get("user_agent_generator_config", {}) + ) self.proxy = kwargs.get("proxy") self.proxy_config = kwargs.get("proxy_config") self.headless = kwargs.get("headless", True) self.browser_type = kwargs.get("browser_type", "chromium") self.headers = kwargs.get("headers", {}) + self.browser_hint = user_agenr_generator.generate_client_hints(self.user_agent) + self.headers.setdefault("sec-ch-ua", self.browser_hint) self.cookies = kwargs.get("cookies", []) self.sessions = {} self.session_ttl = 1800 @@ -307,7 +314,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if self.user_agent: await self.default_context.set_extra_http_headers({ - "User-Agent": self.user_agent + 
"User-Agent": self.user_agent, + "sec-ch-ua": self.browser_hint, + # **self.headers }) else: # Base browser arguments @@ -321,7 +330,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): "--disable-infobars", "--window-position=0,0", "--ignore-certificate-errors", - "--ignore-certificate-errors-spki-list" + "--ignore-certificate-errors-spki-list", + "--disable-blink-features=AutomationControlled", + ] } @@ -642,6 +653,15 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): self._cleanup_expired_sessions() session_id = kwargs.get("session_id") + # Check if in kwargs we have user_agent that will override the default user_agent + user_agent = kwargs.get("user_agent", self.user_agent) + + # Generate random user agent if magic mode is enabled and user_agent_mode is not random + if kwargs.get("user_agent_mode") != "random" and kwargs.get("magic", False): + user_agent = UserAgentGenerator().generate( + **kwargs.get("user_agent_generator_config", {}) + ) + # Handle page creation differently for managed browser context = None if self.use_managed_browser: @@ -666,7 +686,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): else: # Normal context creation for non-persistent or non-Chrome browsers context = await self.browser.new_context( - user_agent=self.user_agent, + user_agent=user_agent, viewport={"width": 1200, "height": 800}, proxy={"server": self.proxy} if self.proxy else None, java_script_enabled=True, @@ -686,10 +706,11 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): else: # Normal context creation context = await self.browser.new_context( - user_agent=self.user_agent, + user_agent=user_agent, viewport={"width": 1920, "height": 1080}, proxy={"server": self.proxy} if self.proxy else None, accept_downloads=self.accept_downloads, + ignore_https_errors=True # Add this line ) if self.cookies: await context.add_cookies(self.cookies) diff --git a/crawl4ai/user_agent_generator.py b/crawl4ai/user_agent_generator.py new file mode 
# Patch-dump residue carried on the first original line (not part of the module):
# 100644 index 00000000..0a4df0bb --- /dev/null +++ b/crawl4ai/user_agent_generator.py @@ -0,0 +1,262 @@
"""Generate randomized browser User-Agent strings and matching Sec-CH-UA values."""

import random
import re
from typing import Dict, List, Literal, Optional, Tuple


class UserAgentGenerator:
    """Build random User-Agent strings from platform, engine and product tokens.

    A User-Agent is assembled as
    ``Mozilla/5.0 <platform> <engine token(s)> <product tokens...>``.
    The token pools live on the instance so callers can extend them.
    """

    # Tokens in a browser stack that name a rendering engine, not a product.
    _ENGINE_PREFIXES = ("Gecko/", "AppleWebKit/")
    # Product tokens that imply a WebKit/Blink style engine token.
    _WEBKIT_PRODUCT_PREFIXES = ("Chrome/", "Safari/", "Edg/")
    # A stack identifies as the first of these names it contains: an Edge UA
    # also carries Chrome and Safari tokens, a Chrome UA also carries Safari.
    _IDENTITY_PRIORITY = ("edge", "chrome", "firefox", "safari")

    def __init__(self):
        self.desktop_platforms = {
            "windows": {
                "10_64": "(Windows NT 10.0; Win64; x64)",
                "10_32": "(Windows NT 10.0; WOW64)",
            },
            "macos": {
                "intel": "(Macintosh; Intel Mac OS X 10_15_7)",
                "newer": "(Macintosh; Intel Mac OS X 10.15; rv:109.0)",
            },
            "linux": {
                "generic": "(X11; Linux x86_64)",
                "ubuntu": "(X11; Ubuntu; Linux x86_64)",
                "chrome_os": "(X11; CrOS x86_64 14541.0.0)",
            },
        }

        self.mobile_platforms = {
            "android": {
                "samsung": "(Linux; Android 13; SM-S901B)",
                "pixel": "(Linux; Android 12; Pixel 6)",
                "oneplus": "(Linux; Android 13; OnePlus 9 Pro)",
                "xiaomi": "(Linux; Android 12; M2102J20SG)",
            },
            "ios": {
                "iphone": "(iPhone; CPU iPhone OS 16_5 like Mac OS X)",
                "ipad": "(iPad; CPU OS 16_5 like Mac OS X)",
            },
        }

        # Valid token combinations, keyed by how many tokens they contain.
        self.browser_combinations = {
            1: [
                ["chrome"],
                ["firefox"],
                ["safari"],
                ["edge"],
            ],
            2: [
                ["gecko", "firefox"],
                ["chrome", "safari"],
                ["webkit", "safari"],
            ],
            3: [
                ["chrome", "safari", "edge"],
                ["webkit", "chrome", "safari"],
            ],
        }

        # Rendering engine tokens. The Gecko token is a frozen constant in
        # real Firefox UAs; the previous pool contained the malformed value
        # "Gecko/2010010", which is removed here.
        self.rendering_engines = {
            "chrome_webkit": "AppleWebKit/537.36",
            "safari_webkit": "AppleWebKit/605.1.15",
            "gecko": ["Gecko/20100101"],
        }

        self.chrome_versions = [
            "Chrome/119.0.6045.199",
            "Chrome/118.0.5993.117",
            "Chrome/117.0.5938.149",
            "Chrome/116.0.5845.187",
            "Chrome/115.0.5790.171",
        ]

        self.edge_versions = [
            "Edg/119.0.2151.97",
            "Edg/118.0.2088.76",
            "Edg/117.0.2045.47",
            "Edg/116.0.1938.81",
            "Edg/115.0.1901.203",
        ]

        self.safari_versions = [
            "Safari/537.36",  # For Chrome-based
            "Safari/605.1.15",
            "Safari/604.1",
            "Safari/602.1",
            "Safari/601.5.17",
        ]

        self.firefox_versions = [
            "Firefox/119.0",
            "Firefox/118.0.2",
            "Firefox/117.0.1",
            "Firefox/116.0",
            "Firefox/115.0.3",
            "Firefox/114.0.2",
            "Firefox/113.0.1",
            "Firefox/112.0",
            "Firefox/111.0.1",
            "Firefox/110.0",
        ]

    def _combination_identity(self, combination: List[str]) -> Optional[str]:
        """Return the browser a combination identifies as, or None."""
        for name in self._IDENTITY_PRIORITY:
            if name in combination:
                return name
        return None

    def _matching_combinations(self, num_browsers: int,
                               browser_type: Optional[str]) -> List[List[str]]:
        """Return the combinations eligible for the given constraints.

        When no combination of ``num_browsers`` tokens identifies as
        ``browser_type`` (e.g. Firefox with three tokens), shorter
        combinations are tried so the requested browser is still honoured.
        """
        if browser_type is None:
            return self.browser_combinations[num_browsers]
        for count in range(num_browsers, 0, -1):
            matches = [
                combo
                for combo in self.browser_combinations.get(count, ())
                if self._combination_identity(combo) == browser_type
            ]
            if matches:
                return matches
        raise ValueError(f"Unsupported browser type: {browser_type}")

    def _random_token(self, name: str) -> Optional[str]:
        """Return a random token for a combination entry; None if unknown."""
        if name == "webkit":
            return self.rendering_engines["chrome_webkit"]
        if name == "gecko":
            return random.choice(self.rendering_engines["gecko"])
        pool = {
            "chrome": self.chrome_versions,
            "firefox": self.firefox_versions,
            "safari": self.safari_versions,
            "edge": self.edge_versions,
        }.get(name)
        return random.choice(pool) if pool else None

    def get_browser_stack(self, num_browsers: int = 1,
                          browser_type: Optional[str] = None) -> List[str]:
        """Get a valid combination of engine/product tokens.

        Args:
            num_browsers: Number of tokens requested (a key of
                ``browser_combinations``).
            browser_type: Optional browser the stack must identify as
                ('chrome', 'edge', 'safari' or 'firefox').

        Raises:
            ValueError: If ``num_browsers`` or ``browser_type`` is unsupported.
        """
        if num_browsers not in self.browser_combinations:
            raise ValueError(f"Unsupported number of browsers: {num_browsers}")

        combination = random.choice(
            self._matching_combinations(num_browsers, browser_type)
        )
        tokens = (self._random_token(name) for name in combination)
        return [token for token in tokens if token is not None]

    def generate(self,
                 device_type: Optional[Literal['desktop', 'mobile']] = None,
                 os_type: Optional[str] = None,
                 device_brand: Optional[str] = None,
                 browser_type: Optional[Literal['chrome', 'edge', 'safari', 'firefox']] = None,
                 num_browsers: int = 3) -> str:
        """
        Generate a random user agent with specified constraints.

        Args:
            device_type: 'desktop' or 'mobile'
            os_type: 'windows', 'macos', 'linux', 'android', 'ios'
            device_brand: Specific device brand
            browser_type: 'chrome', 'edge', 'safari', or 'firefox'
            num_browsers: Number of browser specifications (1-3)

        Returns:
            The assembled User-Agent string.

        Raises:
            ValueError: If ``num_browsers`` or ``browser_type`` is unsupported.
        """
        platform = self.get_random_platform(device_type, os_type, device_brand)
        browser_stack = self.get_browser_stack(num_browsers, browser_type)

        # The stack may already carry an engine token ("gecko"/"webkit"
        # entries). Split it out so the engine is emitted exactly once.
        engine_tokens = [t for t in browser_stack if t.startswith(self._ENGINE_PREFIXES)]
        product_tokens = [t for t in browser_stack if not t.startswith(self._ENGINE_PREFIXES)]

        if any(t.startswith("Firefox/") for t in product_tokens):
            gecko = next((t for t in engine_tokens if t.startswith("Gecko/")), None)
            engine_part = [gecko or random.choice(self.rendering_engines["gecko"])]
        elif any(t.startswith(self._WEBKIT_PRODUCT_PREFIXES) for t in product_tokens):
            webkit = next((t for t in engine_tokens if t.startswith("AppleWebKit/")), None)
            engine_part = [webkit or self.rendering_engines["chrome_webkit"],
                           "(KHTML, like Gecko)"]
        else:
            # Custom combinations without a known product keep their tokens.
            engine_part = engine_tokens

        return " ".join(["Mozilla/5.0", platform, *engine_part, *product_tokens])

    def generate_with_client_hints(self, **kwargs) -> Tuple[str, str]:
        """Generate both user agent and matching client hints"""
        user_agent = self.generate(**kwargs)
        client_hints = self.generate_client_hints(user_agent)
        return user_agent, client_hints

    def get_random_platform(self, device_type, os_type, device_brand):
        """Pick a platform string honouring the given constraints.

        ``os_type`` (when known) overrides ``device_type``. ``device_brand``
        restricts the choice to operating systems that define that brand;
        an unknown brand is ignored.
        """
        if device_type == 'desktop':
            platforms = self.desktop_platforms
        elif device_type == 'mobile':
            platforms = self.mobile_platforms
        else:
            platforms = {**self.desktop_platforms, **self.mobile_platforms}

        if os_type:
            for platform_group in (self.desktop_platforms, self.mobile_platforms):
                if os_type in platform_group:
                    platforms = {os_type: platform_group[os_type]}
                    break

        if device_brand:
            # Without this, the brand only applied when the randomly chosen
            # OS happened to define it.
            branded = {
                name: variants
                for name, variants in platforms.items()
                if device_brand in variants
            }
            if branded:
                platforms = branded

        variants = platforms[random.choice(list(platforms))]
        if device_brand and device_brand in variants:
            return variants[device_brand]
        return random.choice(list(variants.values()))

    def parse_user_agent(self, user_agent: str) -> Dict[str, str]:
        """Parse a user agent string to extract browser and version information"""
        browsers = {
            'chrome': r'Chrome/(\d+)',
            'edge': r'Edg/(\d+)',
            'safari': r'Version/(\d+)',
            'firefox': r'Firefox/(\d+)',
        }

        result = {}
        for browser, pattern in browsers.items():
            match = re.search(pattern, user_agent)
            if match:
                result[browser] = match.group(1)

        return result

    def generate_client_hints(self, user_agent: str) -> str:
        """Generate Sec-CH-UA header value based on user agent string"""
        browsers = self.parse_user_agent(user_agent)
        hints = []

        if 'chrome' in browsers or 'edge' in browsers:
            # An Edge-only UA has no Chrome token; reuse the Edge major
            # version for the Chromium brand in that case.
            chromium_version = browsers.get('chrome', browsers.get('edge'))
            hints.append(f'"Chromium";v="{chromium_version}"')
            hints.append('"Not_A Brand";v="8"')

            if 'edge' in browsers:
                hints.append(f'"Microsoft Edge";v="{browsers["edge"]}"')
            else:
                hints.append(f'"Google Chrome";v="{browsers["chrome"]}"')

        elif 'firefox' in browsers:
            # Firefox doesn't typically send Sec-CH-UA
            return '""'

        elif 'safari' in browsers:
            hints.append(f'"Safari";v="{browsers["safari"]}"')
            hints.append('"Not_A Brand";v="8"')

        return ', '.join(hints)


# Example usage:
if __name__ == "__main__":
    generator = UserAgentGenerator()

    print("\nSingle browser (Chrome):")
    print(generator.generate(num_browsers=1, browser_type='chrome'))

    print("\nTwo browsers (Gecko/Firefox):")
    print(generator.generate(num_browsers=2, browser_type='firefox'))

    print("\nThree browsers (Chrome/Safari/Edge):")
    print(generator.generate(num_browsers=3, browser_type='edge'))

    print("\nFirefox on Linux:")
    print(generator.generate(
        device_type='desktop',
        os_type='linux',
        browser_type='firefox',
        num_browsers=2
    ))

    print("\nChrome/Safari/Edge on Windows:")
    print(generator.generate(
        device_type='desktop',
        os_type='windows',
        browser_type='edge',
        num_browsers=3
    ))

# Patch-dump residue carried on the last original line (start of the next patch):
# From 80d58ad24c64e30ab0c037496de89952516b772e Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sat, 30 Nov 2024 22:00:15 +0800 Subject: [PATCH
47/70] bump version to 0.3.747 --- crawl4ai/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index 4a938b75..189a2955 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,2 +1,2 @@ # crawl4ai/_version.py -__version__ = "0.3.746" +__version__ = "0.3.747" From 293f299c083aab97aa06e8a06045caa7273aae15 Mon Sep 17 00:00:00 2001 From: unclecode Date: Sun, 1 Dec 2024 19:17:33 +0800 Subject: [PATCH 48/70] Add PruningContentFilter with unit tests and update documentation - Introduced the PruningContentFilter for better content relevance. - Implemented comprehensive unit tests for verification of functionality. - Enhanced existing BM25ContentFilter tests for edge case coverage. - Updated documentation to include usage examples for new filter. --- CHANGELOG.md | 50 +++ README.md | 7 +- crawl4ai/content_filter_strategy.py | 285 ++++++++++-------- crawl4ai/content_scraping_strategy.py | 13 +- docs/examples/quickstart_async.py | 8 +- docs/md_v2/advanced/managed_browser.md | 54 +++- docs/md_v2/basic/content_filtering.md | 58 +++- ..._filter.py => test_content_filter_bm25.py} | 0 tests/async/test_content_filter_prune.py | 159 ++++++++++ 9 files changed, 499 insertions(+), 135 deletions(-) rename tests/async/{test_content_filter.py => test_content_filter_bm25.py} (100%) create mode 100644 tests/async/test_content_filter_prune.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 309218dc..03a7afb0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,55 @@ # Changelog +## [0.3.75] December 1, 2024 + +### PruningContentFilter + +#### 1. Introduced PruningContentFilter (Dec 01, 2024) (Dec 01, 2024) +A new content filtering strategy that removes less relevant nodes based on metrics like text and link density. + +**Affected Files:** +- `crawl4ai/content_filter_strategy.py`: Enhancement of content filtering capabilities. 
+```diff +Implemented effective pruning algorithm with comprehensive scoring. +``` +- `README.md`: Improved documentation regarding new features. +```diff +Updated to include usage and explanation for the PruningContentFilter. +``` +- `docs/md_v2/basic/content_filtering.md`: Expanded documentation for users. +```diff +Added detailed section explaining the PruningContentFilter. +``` + +#### 2. Added Unit Tests for PruningContentFilter (Dec 01, 2024) +Comprehensive tests added to ensure correct functionality of PruningContentFilter. + +**Affected Files:** +- `tests/async/test_content_filter_prune.py`: Increased test coverage for content filtering strategies. +```diff +Created test cases for various scenarios using the PruningContentFilter. +``` + +### Development Updates + +#### 3. Enhanced BM25ContentFilter tests (Dec 01, 2024) +Extended testing to cover additional edge cases and performance metrics. + +**Affected Files:** +- `tests/async/test_content_filter_bm25.py`: Improved reliability and performance assurance. +```diff +Added tests for new extraction scenarios including malformed HTML. +``` + +### Infrastructure & Documentation + +#### 4. Updated Examples (Dec 01, 2024) +Altered examples in documentation to promote the use of PruningContentFilter alongside existing strategies. + +**Affected Files:** +- `docs/examples/quickstart_async.py`: Enhanced usability and clarity for new users. +- Revised example to illustrate usage of PruningContentFilter. 
+ ## [0.3.746] November 29, 2024 ### Major Features diff --git a/README.md b/README.md index 405c1002..d70af8ad 100644 --- a/README.md +++ b/README.md @@ -422,7 +422,7 @@ You can check the project structure in the directory [https://github.com/uncleco ```python import asyncio from crawl4ai import AsyncWebCrawler, CacheMode -from crawl4ai.content_filter_strategy import BM25ContentFilter +from crawl4ai.content_filter_strategy import PruningContentFilter, BM25ContentFilter from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator async def main(): @@ -434,8 +434,11 @@ async def main(): url="https://docs.micronaut.io/4.7.6/guide/", cache_mode=CacheMode.ENABLED, markdown_generator=DefaultMarkdownGenerator( - content_filter=BM25ContentFilter(user_query=None, bm25_threshold=1.0) + content_filter=PruningContentFilter(threshold=0.48, threshold_type="fixed", min_word_threshold=0) ), + # markdown_generator=DefaultMarkdownGenerator( + # content_filter=BM25ContentFilter(user_query="WHEN_WE_FOCUS_BASED_ON_A_USER_QUERY", bm25_threshold=1.0) + # ), ) print(len(result.markdown)) print(len(result.fit_markdown)) diff --git a/crawl4ai/content_filter_strategy.py b/crawl4ai/content_filter_strategy.py index e6891a3f..ca3868bb 100644 --- a/crawl4ai/content_filter_strategy.py +++ b/crawl4ai/content_filter_strategy.py @@ -4,10 +4,10 @@ from typing import List, Tuple, Dict from rank_bm25 import BM25Okapi from time import perf_counter from collections import deque -from bs4 import BeautifulSoup, NavigableString, Tag +from bs4 import BeautifulSoup, NavigableString, Tag, Comment from .utils import clean_tokens from abc import ABC, abstractmethod - +import math from snowballstemmer import stemmer @@ -358,145 +358,186 @@ class BM25ContentFilter(RelevantContentFilter): return [self.clean_element(tag) for _, _, tag in selected_candidates] -class HeuristicContentFilter(RelevantContentFilter): - def __init__(self): - super().__init__() - # Weights for different heuristics - 
class PruningContentFilter(RelevantContentFilter):
    """Content filter that prunes low-scoring nodes from the HTML tree.

    Each element is scored from text density, link density, a per-tag
    weight, a class/id weight and text length. Elements scoring below the
    (fixed or dynamically adjusted) threshold are removed together with
    their subtree; surviving children are scored recursively.

    ``excluded_tags`` and ``negative_patterns`` are read from ``self`` and
    are presumably provided by ``RelevantContentFilter`` - confirm there.
    """

    def __init__(self, user_query: str = None, min_word_threshold: int = None,
                 threshold_type: str = 'fixed', threshold: float = 0.48):
        """Configure the filter.

        Args:
            user_query: Forwarded unchanged to the base class.
            min_word_threshold: Nodes with fewer words are always pruned;
                ``None`` or ``0`` disables the check.
            threshold_type: ``'fixed'`` uses ``threshold`` as is; any other
                value selects the dynamic per-node adjustment.
            threshold: Base score below which a node is removed.
        """
        super().__init__(user_query)
        self.min_word_threshold = min_word_threshold
        self.threshold_type = threshold_type
        self.threshold = threshold

        # Tag importance, used only by the dynamic threshold.
        self.tag_importance = {
            'article': 1.5,
            'main': 1.4,
            'section': 1.3,
            'p': 1.2,
            'h1': 1.4,
            'h2': 1.3,
            'h3': 1.2,
            'div': 0.7,
            'span': 0.6
        }

        # Which metrics take part in the composite score.
        self.metric_config = {
            'text_density': True,
            'link_density': True,
            'tag_weight': True,
            'class_id_weight': True,
            'text_length': True,
        }

        self.metric_weights = {
            'text_density': 0.4,
            'link_density': 0.2,
            'tag_weight': 0.2,
            'class_id_weight': 0.1,
            'text_length': 0.1,
        }

        self.tag_weights = {
            'div': 0.5,
            'p': 1.0,
            'article': 1.5,
            'section': 1.0,
            'span': 0.3,
            'li': 0.5,
            'ul': 0.5,
            'ol': 0.5,
            'h1': 1.2,
            'h2': 1.1,
            'h3': 1.0,
            'h4': 0.9,
            'h5': 0.8,
            'h6': 0.7,
        }

    def filter_content(self, html: str, min_word_threshold: int = None) -> List[str]:
        """Return the HTML of the top-level body elements that survive pruning.

        Args:
            html: Raw HTML document or fragment.
            min_word_threshold: Optional per-call override of the instance
                ``min_word_threshold``; ``None`` keeps the instance value.

        Returns:
            One HTML string per remaining direct child of ``<body>`` that
            still contains text; ``[]`` for empty or non-string input.
        """
        if not html or not isinstance(html, str):
            return []

        soup = BeautifulSoup(html, 'lxml')
        if not soup.body:
            # Wrap bare fragments so there is a body to prune. The wrapper
            # markup was missing here, which made this re-parse a no-op.
            soup = BeautifulSoup(f'<body>{html}</body>', 'lxml')

        self._remove_comments(soup)
        self._remove_unwanted_tags(soup)

        body = soup.find('body')
        if body is None:
            return []

        self._prune_tree(body, min_word_threshold)

        return [
            str(element)
            for element in body.children
            if not isinstance(element, str)
            and hasattr(element, 'name')
            and len(element.get_text(strip=True)) > 0
        ]

    def _remove_comments(self, soup):
        """Strip HTML comment nodes from the tree."""
        # `string=` is the current name of the deprecated `text=` argument.
        for element in soup.find_all(string=lambda s: isinstance(s, Comment)):
            element.extract()

    def _remove_unwanted_tags(self, soup):
        """Remove every element whose tag name is in ``self.excluded_tags``."""
        for tag in self.excluded_tags:
            for element in soup.find_all(tag):
                element.decompose()

    def _prune_tree(self, node, min_word_threshold: int = None):
        """Score ``node``; remove it or recurse into its children.

        Args:
            node: Element to evaluate; text nodes and ``None`` are ignored.
            min_word_threshold: Optional override passed through to scoring.
        """
        if not node or not hasattr(node, 'name') or node.name is None:
            return

        text_len = len(node.get_text(strip=True))
        tag_len = len(node.encode_contents().decode('utf-8'))
        # NOTE(review): only direct-child anchors with a single string child
        # are counted, so links nested deeper do not raise link density here.
        link_text_len = sum(
            len(s.strip())
            for s in (a.string for a in node.find_all('a', recursive=False))
            if s
        )

        metrics = {
            'node': node,
            'tag_name': node.name,
            'text_len': text_len,
            'tag_len': tag_len,
            'link_text_len': link_text_len
        }

        # Previously called with `metrics` only, which raised TypeError
        # against the four-argument signature.
        score = self._compute_composite_score(
            metrics, text_len, tag_len, link_text_len, min_word_threshold
        )

        if self.threshold_type == 'fixed':
            should_remove = score < self.threshold
        else:  # dynamic
            tag_importance = self.tag_importance.get(node.name, 0.7)
            text_ratio = text_len / tag_len if tag_len > 0 else 0
            link_ratio = link_text_len / text_len if text_len > 0 else 1

            threshold = self.threshold  # base threshold
            if tag_importance > 1:
                threshold *= 0.8
            if text_ratio > 0.4:
                threshold *= 0.9
            if link_ratio > 0.6:
                threshold *= 1.2

            should_remove = score < threshold

        if should_remove:
            node.decompose()
            return

        # Snapshot the children first: recursion may remove some of them.
        children = [child for child in node.children if hasattr(child, 'name')]
        for child in children:
            self._prune_tree(child, min_word_threshold)

    def _compute_composite_score(self, metrics, text_len=None, tag_len=None,
                                 link_text_len=None, min_word_threshold=None):
        """Return the weighted average of the enabled metrics for a node.

        Args:
            metrics: Dict with ``node``, ``tag_name``, ``text_len``,
                ``tag_len`` and ``link_text_len``.
            text_len, tag_len, link_text_len: Optional explicit values; when
                omitted they are read from ``metrics``.
            min_word_threshold: Optional override of the instance setting.

        Returns:
            ``-1.0`` when the node is below the word threshold, otherwise
            the weighted score (``0`` if no metric is enabled).
        """
        if text_len is None:
            text_len = metrics['text_len']
        if tag_len is None:
            tag_len = metrics['tag_len']
        if link_text_len is None:
            link_text_len = metrics['link_text_len']
        word_floor = (self.min_word_threshold if min_word_threshold is None
                      else min_word_threshold)

        if word_floor:
            # Cheap word count on the node's stripped text.
            text = metrics['node'].get_text(strip=True)
            word_count = text.count(' ') + 1
            if word_count < word_floor:
                return -1.0  # Guaranteed removal

        score = 0.0
        total_weight = 0.0

        if self.metric_config['text_density']:
            density = text_len / tag_len if tag_len > 0 else 0
            score += self.metric_weights['text_density'] * density
            total_weight += self.metric_weights['text_density']

        if self.metric_config['link_density']:
            density = 1 - (link_text_len / text_len if text_len > 0 else 0)
            score += self.metric_weights['link_density'] * density
            total_weight += self.metric_weights['link_density']

        if self.metric_config['tag_weight']:
            tag_score = self.tag_weights.get(metrics['tag_name'], 0.5)
            score += self.metric_weights['tag_weight'] * tag_score
            total_weight += self.metric_weights['tag_weight']

        if self.metric_config['class_id_weight']:
            class_score = self._compute_class_id_weight(metrics['node'])
            # NOTE(review): class_score is never positive, so max(0, ...)
            # always contributes 0 and negative class/id matches do not lower
            # the score. Left unchanged to keep scores stable - confirm intent.
            score += self.metric_weights['class_id_weight'] * max(0, class_score)
            total_weight += self.metric_weights['class_id_weight']

        if self.metric_config['text_length']:
            score += self.metric_weights['text_length'] * math.log(text_len + 1)
            total_weight += self.metric_weights['text_length']

        return score / total_weight if total_weight > 0 else 0

    def _compute_class_id_weight(self, node):
        """Return 0, -0.5 or -1.0 depending on negative class/id matches."""
        class_id_score = 0
        # NOTE(review): `.match` anchors at the start of the string, so a
        # negative word later in the class list is not detected; `.search`
        # may be intended - confirm against `negative_patterns`.
        if 'class' in node.attrs:
            classes = ' '.join(node['class'])
            if self.negative_patterns.match(classes):
                class_id_score -= 0.5
        if 'id' in node.attrs:
            element_id = node['id']
            if self.negative_patterns.match(element_id):
                class_id_score -= 0.5
        return class_id_score

# Patch-dump residue carried on the last original line (start of the next file diff):
# diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index ec6c3361..de8894b7 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -9,7 +9,7 @@ from bs4 import element, NavigableString, Comment from urllib.parse import
urljoin from requests.exceptions import InvalidSchema # from .content_cleaning_strategy import ContentCleaningStrategy -from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter#, HeuristicContentFilter +from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter, PruningContentFilter from .markdown_generation_strategy import MarkdownGenerationStrategy, DefaultMarkdownGenerator from .models import MarkdownGenerationResult from .utils import ( @@ -110,10 +110,15 @@ class WebScrapingStrategy(ContentScrapingStrategy): if markdown_generator: try: if kwargs.get('fit_markdown', False) and not markdown_generator.content_filter: - markdown_generator.content_filter = BM25ContentFilter( - user_query=kwargs.get('fit_markdown_user_query', None), - bm25_threshold=kwargs.get('fit_markdown_bm25_threshold', 1.0) + markdown_generator.content_filter = PruningContentFilter( + threshold_type=kwargs.get('fit_markdown_treshold_type', 'fixed'), + threshold=kwargs.get('fit_markdown_treshold', 0.48), + min_word_threshold=kwargs.get('fit_markdown_min_word_threshold', ), ) + # markdown_generator.content_filter = BM25ContentFilter( + # user_query=kwargs.get('fit_markdown_user_query', None), + # bm25_threshold=kwargs.get('fit_markdown_bm25_threshold', 1.0) + # ) markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown( cleaned_html=cleaned_html, diff --git a/docs/examples/quickstart_async.py b/docs/examples/quickstart_async.py index 679a9bc2..73d695c3 100644 --- a/docs/examples/quickstart_async.py +++ b/docs/examples/quickstart_async.py @@ -15,7 +15,7 @@ from bs4 import BeautifulSoup from pydantic import BaseModel, Field from crawl4ai import AsyncWebCrawler, CacheMode from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator -from crawl4ai.content_filter_strategy import BM25ContentFilter +from crawl4ai.content_filter_strategy import BM25ContentFilter, PruningContentFilter from crawl4ai.extraction_strategy import ( 
JsonCssExtractionStrategy, LLMExtractionStrategy, @@ -466,7 +466,8 @@ async def speed_comparison(): url="https://www.nbcnews.com/business", word_count_threshold=0, markdown_generator=DefaultMarkdownGenerator( - content_filter=BM25ContentFilter(user_query=None, bm25_threshold=1.0) + content_filter = PruningContentFilter(threshold=0.48, threshold_type="fixed", min_word_threshold=0) + # content_filter=BM25ContentFilter(user_query=None, bm25_threshold=1.0) ), cache_mode=CacheMode.BYPASS, verbose=False, @@ -489,7 +490,8 @@ async def speed_comparison(): word_count_threshold=0, cache_mode=CacheMode.BYPASS, markdown_generator=DefaultMarkdownGenerator( - content_filter=BM25ContentFilter(user_query=None, bm25_threshold=1.0) + content_filter = PruningContentFilter(threshold=0.48, threshold_type="fixed", min_word_threshold=0) + # content_filter=BM25ContentFilter(user_query=None, bm25_threshold=1.0) ), verbose=False, ) diff --git a/docs/md_v2/advanced/managed_browser.md b/docs/md_v2/advanced/managed_browser.md index 80d6fc1a..0d327f2e 100644 --- a/docs/md_v2/advanced/managed_browser.md +++ b/docs/md_v2/advanced/managed_browser.md @@ -4,7 +4,59 @@ This guide explains how to use content filtering strategies in Crawl4AI to extra ## Relevance Content Filter -The `RelevanceContentFilter` is an abstract class that provides a common interface for content filtering strategies. Specific filtering algorithms, like `BM25ContentFilter`, inherit from this class and implement the `filter_content` method. This method takes the HTML content as input and returns a list of filtered text blocks. +The `RelevanceContentFilter` is an abstract class that provides a common interface for content filtering strategies. Specific filtering algorithms, like `PruningContentFilter` or `BM25ContentFilter`, inherit from this class and implement the `filter_content` method. This method takes the HTML content as input and returns a list of filtered text blocks. 
+ + +## Pruning Content Filter + +The `PruningContentFilter` is a tree-shaking algorithm that analyzes the HTML DOM structure and removes less relevant nodes based on various metrics like text density, link density, and tag importance. It evaluates each node using a composite scoring system and "prunes" nodes that fall below a certain threshold. + +### Usage + +```python +from crawl4ai import AsyncWebCrawler +from crawl4ai.content_filter_strategy import PruningContentFilter + +async def filter_content(url): + async with AsyncWebCrawler() as crawler: + content_filter = PruningContentFilter( + min_word_threshold=5, + threshold_type='dynamic', + threshold=0.45 + ) + result = await crawler.arun(url=url, extraction_strategy=content_filter, fit_markdown=True) + if result.success: + print(f"Cleaned Markdown:\n{result.fit_markdown}") +``` + +### Parameters + +- **`min_word_threshold`**: (Optional) Minimum number of words a node must contain to be considered relevant. Nodes with fewer words are automatically pruned. + +- **`threshold_type`**: (Optional, default 'fixed') Controls how pruning thresholds are calculated: + - `'fixed'`: Uses a constant threshold value for all nodes + - `'dynamic'`: Adjusts threshold based on node characteristics like tag importance and text/link ratios + +- **`threshold`**: (Optional, default 0.48) Base threshold value for node pruning: + - For fixed threshold: Nodes scoring below this value are removed + - For dynamic threshold: This value is adjusted based on node properties + +### How It Works + +The pruning algorithm evaluates each node using multiple metrics: +- Text density: Ratio of actual text to overall node content +- Link density: Proportion of text within links +- Tag importance: Weight based on HTML tag type (e.g., article, p, div) +- Content quality: Metrics like text length and structural importance + +Nodes scoring below the threshold are removed, effectively "shaking" less relevant content from the DOM tree. 
This results in a cleaner document containing only the most relevant content blocks. + +The algorithm is particularly effective for: +- Removing boilerplate content +- Eliminating navigation menus and sidebars +- Preserving main article content +- Maintaining document structure while removing noise + ## BM25 Algorithm diff --git a/docs/md_v2/basic/content_filtering.md b/docs/md_v2/basic/content_filtering.md index 9506c075..0d327f2e 100644 --- a/docs/md_v2/basic/content_filtering.md +++ b/docs/md_v2/basic/content_filtering.md @@ -4,7 +4,59 @@ This guide explains how to use content filtering strategies in Crawl4AI to extra ## Relevance Content Filter -The `RelevanceContentFilter` is an abstract class that provides a common interface for content filtering strategies. Specific filtering algorithms, like `BM25ContentFilter`, inherit from this class and implement the `filter_content` method. This method takes the HTML content as input and returns a list of filtered text blocks. +The `RelevantContentFilter` is an abstract class that provides a common interface for content filtering strategies. Specific filtering algorithms, like `PruningContentFilter` or `BM25ContentFilter`, inherit from this class and implement the `filter_content` method. This method takes the HTML content as input and returns a list of filtered text blocks. + + +## Pruning Content Filter + +The `PruningContentFilter` is a tree-shaking algorithm that analyzes the HTML DOM structure and removes less relevant nodes based on various metrics like text density, link density, and tag importance. It evaluates each node using a composite scoring system and "prunes" nodes that fall below a certain threshold. 
+ +### Usage + +```python +from crawl4ai import AsyncWebCrawler +from crawl4ai.content_filter_strategy import PruningContentFilter + +async def filter_content(url): + async with AsyncWebCrawler() as crawler: + content_filter = PruningContentFilter( + min_word_threshold=5, + threshold_type='dynamic', + threshold=0.45 + ) + result = await crawler.arun(url=url, extraction_strategy=content_filter, fit_markdown=True) + if result.success: + print(f"Cleaned Markdown:\n{result.fit_markdown}") +``` + +### Parameters + +- **`min_word_threshold`**: (Optional) Minimum number of words a node must contain to be considered relevant. Nodes with fewer words are automatically pruned. + +- **`threshold_type`**: (Optional, default 'fixed') Controls how pruning thresholds are calculated: + - `'fixed'`: Uses a constant threshold value for all nodes + - `'dynamic'`: Adjusts threshold based on node characteristics like tag importance and text/link ratios + +- **`threshold`**: (Optional, default 0.48) Base threshold value for node pruning: + - For fixed threshold: Nodes scoring below this value are removed + - For dynamic threshold: This value is adjusted based on node properties + +### How It Works + +The pruning algorithm evaluates each node using multiple metrics: +- Text density: Ratio of actual text to overall node content +- Link density: Proportion of text within links +- Tag importance: Weight based on HTML tag type (e.g., article, p, div) +- Content quality: Metrics like text length and structural importance + +Nodes scoring below the threshold are removed, effectively "shaking" less relevant content from the DOM tree. This results in a cleaner document containing only the most relevant content blocks. 
+ +The algorithm is particularly effective for: +- Removing boilerplate content +- Eliminating navigation menus and sidebars +- Preserving main article content +- Maintaining document structure while removing noise + ## BM25 Algorithm @@ -21,7 +73,7 @@ from crawl4ai.content_filter_strategy import BM25ContentFilter async def filter_content(url, query=None): async with AsyncWebCrawler() as crawler: content_filter = BM25ContentFilter(user_query=query) - result = await crawler.arun(url=url, content_filter=content_filter, fit_markdown=True) # Set fit_markdown flag to True to trigger BM25 filtering + result = await crawler.arun(url=url, extraction_strategy=content_filter, fit_markdown=True) # Set fit_markdown flag to True to trigger BM25 filtering if result.success: print(f"Filtered Content (JSON):\n{result.extracted_content}") print(f"\nFiltered Markdown:\n{result.fit_markdown}") # New field in CrawlResult object @@ -71,7 +123,7 @@ class MyCustomFilter(RelevantContentFilter): async def custom_filter_demo(url: str): async with AsyncWebCrawler() as crawler: custom_filter = MyCustomFilter() - result = await crawler.arun(url, content_filter=custom_filter) + result = await crawler.arun(url, extraction_strategy=custom_filter) if result.success: print(result.extracted_content) diff --git a/tests/async/test_content_filter.py b/tests/async/test_content_filter_bm25.py similarity index 100% rename from tests/async/test_content_filter.py rename to tests/async/test_content_filter_bm25.py diff --git a/tests/async/test_content_filter_prune.py b/tests/async/test_content_filter_prune.py new file mode 100644 index 00000000..23b0fa3a --- /dev/null +++ b/tests/async/test_content_filter_prune.py @@ -0,0 +1,159 @@ +import os, sys +import pytest +from bs4 import BeautifulSoup + +parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +sys.path.append(parent_dir) + +from crawl4ai.content_filter_strategy import PruningContentFilter + +@pytest.fixture +def basic_html(): + return 
""" + + +
+

Main Article

+

This is a high-quality paragraph with substantial text content. It contains enough words to pass the threshold and has good text density without too many links. This kind of content should survive the pruning process.

+ + +
+ + + """ + +@pytest.fixture +def link_heavy_html(): + return """ + + +
+

Good content paragraph that should remain.

+ +
+ + + """ + +@pytest.fixture +def mixed_content_html(): + return """ + + +
+

Article Title

+

Short summary.

+
+

Long high-quality paragraph with substantial content that should definitely survive the pruning process. This content has good text density and proper formatting which makes it valuable for retention.

+
+
+

Short comment 1

+

Short comment 2

+
+
+ + + """ + +class TestPruningContentFilter: + def test_basic_pruning(self, basic_html): + """Test basic content pruning functionality""" + filter = PruningContentFilter(min_word_threshold=5) + contents = filter.filter_content(basic_html) + + combined_content = ' '.join(contents).lower() + assert "high-quality paragraph" in combined_content + assert "sidebar content" not in combined_content + assert "share buttons" not in combined_content + + def test_min_word_threshold(self, mixed_content_html): + """Test minimum word threshold filtering""" + filter = PruningContentFilter(min_word_threshold=10) + contents = filter.filter_content(mixed_content_html) + + combined_content = ' '.join(contents).lower() + assert "short summary" not in combined_content + assert "long high-quality paragraph" in combined_content + assert "short comment" not in combined_content + + def test_threshold_types(self, basic_html): + """Test fixed vs dynamic thresholds""" + fixed_filter = PruningContentFilter(threshold_type='fixed', threshold=0.48) + dynamic_filter = PruningContentFilter(threshold_type='dynamic', threshold=0.45) + + fixed_contents = fixed_filter.filter_content(basic_html) + dynamic_contents = dynamic_filter.filter_content(basic_html) + + assert len(fixed_contents) != len(dynamic_contents), \ + "Fixed and dynamic thresholds should yield different results" + + def test_link_density_impact(self, link_heavy_html): + """Test handling of link-heavy content""" + filter = PruningContentFilter(threshold_type='dynamic') + contents = filter.filter_content(link_heavy_html) + + combined_content = ' '.join(contents).lower() + assert "good content paragraph" in combined_content + assert len([c for c in contents if 'href' in c]) < 2, \ + "Should prune link-heavy sections" + + def test_tag_importance(self, mixed_content_html): + """Test tag importance in scoring""" + filter = PruningContentFilter(threshold_type='dynamic') + contents = filter.filter_content(mixed_content_html) + + has_article = 
any('article' in c.lower() for c in contents) + has_h1 = any('h1' in c.lower() for c in contents) + assert has_article or has_h1, "Should retain important tags" + + def test_empty_input(self): + """Test handling of empty input""" + filter = PruningContentFilter() + assert filter.filter_content("") == [] + assert filter.filter_content(None) == [] + + def test_malformed_html(self): + """Test handling of malformed HTML""" + malformed_html = "
Unclosed div

Nestedcontent

" + filter = PruningContentFilter() + contents = filter.filter_content(malformed_html) + assert isinstance(contents, list) + + def test_performance(self, basic_html): + """Test performance with timer""" + filter = PruningContentFilter() + + import time + start = time.perf_counter() + filter.filter_content(basic_html) + duration = time.perf_counter() - start + + # Extra strict on performance since you mentioned milliseconds matter + assert duration < 0.1, f"Processing took too long: {duration:.3f} seconds" + + @pytest.mark.parametrize("threshold,expected_count", [ + (0.3, 4), # Very lenient + (0.48, 2), # Default + (0.7, 1), # Very strict + ]) + def test_threshold_levels(self, mixed_content_html, threshold, expected_count): + """Test different threshold levels""" + filter = PruningContentFilter(threshold_type='fixed', threshold=threshold) + contents = filter.filter_content(mixed_content_html) + assert len(contents) <= expected_count, \ + f"Expected {expected_count} or fewer elements with threshold {threshold}" + + def test_consistent_output(self, basic_html): + """Test output consistency across multiple runs""" + filter = PruningContentFilter() + first_run = filter.filter_content(basic_html) + second_run = filter.filter_content(basic_html) + assert first_run == second_run, "Output should be consistent" + +if __name__ == "__main__": + pytest.main([__file__]) \ No newline at end of file From 95a4f74d2a9c0ae8c6f727cce6f6d0c17694aeb4 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Mon, 2 Dec 2024 20:37:28 +0800 Subject: [PATCH 49/70] fix: pass logger to WebScrapingStrategy and update score computation in PruningContentFilter --- crawl4ai/async_webcrawler.py | 4 +++- crawl4ai/content_filter_strategy.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 66b4c21b..8db69333 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -472,7 +472,9 @@ class AsyncWebCrawler: 
try: _url = url if not kwargs.get("is_raw_html", False) else "Raw HTML" t1 = time.perf_counter() - scrapping_strategy = WebScrapingStrategy() + scrapping_strategy = WebScrapingStrategy( + logger=self.logger, + ) # result = await scrapping_strategy.ascrap( result = scrapping_strategy.scrap( url, diff --git a/crawl4ai/content_filter_strategy.py b/crawl4ai/content_filter_strategy.py index ca3868bb..f05b92fa 100644 --- a/crawl4ai/content_filter_strategy.py +++ b/crawl4ai/content_filter_strategy.py @@ -468,7 +468,7 @@ class PruningContentFilter(RelevantContentFilter): 'link_text_len': link_text_len } - score = self._compute_composite_score(metrics) + score = self._compute_composite_score(metrics, text_len, tag_len, link_text_len) if self.threshold_type == 'fixed': should_remove = score < self.threshold From e9639ad18972d11929823ff9b1bb9794ad938750 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Tue, 3 Dec 2024 19:44:38 +0800 Subject: [PATCH 50/70] refactor: improve error handling in DataProcessor and optimize data parsing logic --- crawl4ai/async_webcrawler.py | 305 ++++++++++++++++++----------------- 1 file changed, 155 insertions(+), 150 deletions(-) diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 8db69333..2c17602d 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -7,6 +7,7 @@ from pathlib import Path from typing import Optional, List, Union import json import asyncio +from contextlib import nullcontext from .models import CrawlResult, MarkdownGenerationResult from .async_database import async_db_manager from .chunking_strategy import * @@ -67,6 +68,7 @@ class AsyncWebCrawler: always_bypass_cache: bool = False, always_by_pass_cache: Optional[bool] = None, # Deprecated parameter base_directory: str = str(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())), + thread_safe: bool = False, **kwargs, ): """ @@ -104,6 +106,8 @@ class AsyncWebCrawler: else: self.always_bypass_cache = always_bypass_cache + self._lock 
= asyncio.Lock() if thread_safe else None + self.crawl4ai_folder = os.path.join(base_directory, ".crawl4ai") os.makedirs(self.crawl4ai_folder, exist_ok=True) os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True) @@ -178,169 +182,170 @@ class AsyncWebCrawler: Returns: CrawlResult: The result of crawling and processing """ - try: - # Handle deprecated parameters - if any([bypass_cache, disable_cache, no_cache_read, no_cache_write]): - if kwargs.get("warning", True): - warnings.warn( - "Cache control boolean flags are deprecated and will be removed in version X.X.X. " - "Use 'cache_mode' parameter instead. Examples:\n" - "- For bypass_cache=True, use cache_mode=CacheMode.BYPASS\n" - "- For disable_cache=True, use cache_mode=CacheMode.DISABLED\n" - "- For no_cache_read=True, use cache_mode=CacheMode.WRITE_ONLY\n" - "- For no_cache_write=True, use cache_mode=CacheMode.READ_ONLY\n" - "Pass warning=False to suppress this warning.", - DeprecationWarning, - stacklevel=2 - ) + async with self._lock or nullcontext(): + try: + # Handle deprecated parameters + if any([bypass_cache, disable_cache, no_cache_read, no_cache_write]): + if kwargs.get("warning", True): + warnings.warn( + "Cache control boolean flags are deprecated and will be removed in version X.X.X. " + "Use 'cache_mode' parameter instead. 
Examples:\n" + "- For bypass_cache=True, use cache_mode=CacheMode.BYPASS\n" + "- For disable_cache=True, use cache_mode=CacheMode.DISABLED\n" + "- For no_cache_read=True, use cache_mode=CacheMode.WRITE_ONLY\n" + "- For no_cache_write=True, use cache_mode=CacheMode.READ_ONLY\n" + "Pass warning=False to suppress this warning.", + DeprecationWarning, + stacklevel=2 + ) + + # Convert legacy parameters if cache_mode not provided + if cache_mode is None: + cache_mode = _legacy_to_cache_mode( + disable_cache=disable_cache, + bypass_cache=bypass_cache, + no_cache_read=no_cache_read, + no_cache_write=no_cache_write + ) - # Convert legacy parameters if cache_mode not provided + # Default to ENABLED if no cache mode specified if cache_mode is None: - cache_mode = _legacy_to_cache_mode( - disable_cache=disable_cache, - bypass_cache=bypass_cache, - no_cache_read=no_cache_read, - no_cache_write=no_cache_write + cache_mode = CacheMode.ENABLED + + # Create cache context + cache_context = CacheContext(url, cache_mode, self.always_bypass_cache) + + extraction_strategy = extraction_strategy or NoExtractionStrategy() + extraction_strategy.verbose = verbose + if not isinstance(extraction_strategy, ExtractionStrategy): + raise ValueError("Unsupported extraction strategy") + if not isinstance(chunking_strategy, ChunkingStrategy): + raise ValueError("Unsupported chunking strategy") + + word_count_threshold = max(word_count_threshold, MIN_WORD_THRESHOLD) + + async_response: AsyncCrawlResponse = None + cached_result = None + screenshot_data = None + extracted_content = None + + start_time = time.perf_counter() + + # Try to get cached result if appropriate + if cache_context.should_read(): + cached_result = await async_db_manager.aget_cached_url(url) + + if cached_result: + html = sanitize_input_encode(cached_result.html) + extracted_content = sanitize_input_encode(cached_result.extracted_content or "") + if screenshot: + screenshot_data = cached_result.screenshot + if not screenshot_data: + 
cached_result = None + # if verbose: + # print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Cache hit for {cache_context.display_url} | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {time.perf_counter() - start_time:.2f}s") + self.logger.url_status( + url=cache_context.display_url, + success=bool(html), + timing=time.perf_counter() - start_time, + tag="FETCH" + ) + + + # Fetch fresh content if needed + if not cached_result or not html: + t1 = time.perf_counter() + + if user_agent: + self.crawler_strategy.update_user_agent(user_agent) + async_response: AsyncCrawlResponse = await self.crawler_strategy.crawl( + url, + screenshot=screenshot, + **kwargs ) - - # Default to ENABLED if no cache mode specified - if cache_mode is None: - cache_mode = CacheMode.ENABLED - - # Create cache context - cache_context = CacheContext(url, cache_mode, self.always_bypass_cache) - - extraction_strategy = extraction_strategy or NoExtractionStrategy() - extraction_strategy.verbose = verbose - if not isinstance(extraction_strategy, ExtractionStrategy): - raise ValueError("Unsupported extraction strategy") - if not isinstance(chunking_strategy, ChunkingStrategy): - raise ValueError("Unsupported chunking strategy") - - word_count_threshold = max(word_count_threshold, MIN_WORD_THRESHOLD) - - async_response: AsyncCrawlResponse = None - cached_result = None - screenshot_data = None - extracted_content = None - - start_time = time.perf_counter() - - # Try to get cached result if appropriate - if cache_context.should_read(): - cached_result = await async_db_manager.aget_cached_url(url) - - if cached_result: - html = sanitize_input_encode(cached_result.html) - extracted_content = sanitize_input_encode(cached_result.extracted_content or "") - if screenshot: - screenshot_data = cached_result.screenshot - if not screenshot_data: - cached_result = None - # if verbose: - # print(f"{Fore.BLUE}{self.tag_format('FETCH')} 
{self.log_icons['FETCH']} Cache hit for {cache_context.display_url} | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {time.perf_counter() - start_time:.2f}s") - self.logger.url_status( + html = sanitize_input_encode(async_response.html) + screenshot_data = async_response.screenshot + t2 = time.perf_counter() + self.logger.url_status( url=cache_context.display_url, success=bool(html), - timing=time.perf_counter() - start_time, + timing=t2 - t1, tag="FETCH" - ) + ) + # if verbose: + # print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Live fetch for {cache_context.display_url}... | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {t2 - t1:.2f}s") - - # Fetch fresh content if needed - if not cached_result or not html: - t1 = time.perf_counter() + # Process the HTML content + crawl_result = await self.aprocess_html( + url=url, + html=html, + extracted_content=extracted_content, + word_count_threshold=word_count_threshold, + extraction_strategy=extraction_strategy, + chunking_strategy=chunking_strategy, + content_filter=content_filter, + css_selector=css_selector, + screenshot=screenshot_data, + verbose=verbose, + is_cached=bool(cached_result), + async_response=async_response, + is_web_url=cache_context.is_web_url, + is_local_file=cache_context.is_local_file, + is_raw_html=cache_context.is_raw_html, + **kwargs, + ) - if user_agent: - self.crawler_strategy.update_user_agent(user_agent) - async_response: AsyncCrawlResponse = await self.crawler_strategy.crawl( - url, - screenshot=screenshot, - **kwargs - ) - html = sanitize_input_encode(async_response.html) - screenshot_data = async_response.screenshot - t2 = time.perf_counter() - self.logger.url_status( - url=cache_context.display_url, - success=bool(html), - timing=t2 - t1, - tag="FETCH" - ) + # Set response data + if async_response: + crawl_result.status_code = async_response.status_code + crawl_result.response_headers = 
async_response.response_headers + crawl_result.downloaded_files = async_response.downloaded_files + else: + crawl_result.status_code = 200 + crawl_result.response_headers = cached_result.response_headers if cached_result else {} + + crawl_result.success = bool(html) + crawl_result.session_id = kwargs.get("session_id", None) + # if verbose: - # print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Live fetch for {cache_context.display_url}... | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {t2 - t1:.2f}s") + # print(f"{Fore.GREEN}{self.tag_format('COMPLETE')} {self.log_icons['COMPLETE']} {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... | Status: {Fore.GREEN if crawl_result.success else Fore.RED}{crawl_result.success} | {Fore.YELLOW}Total: {time.perf_counter() - start_time:.2f}s{Style.RESET_ALL}") + self.logger.success( + message="{url:.50}... | Status: {status} | Total: {timing}", + tag="COMPLETE", + params={ + "url": cache_context.display_url, + "status": crawl_result.success, + "timing": f"{time.perf_counter() - start_time:.2f}s" + }, + colors={ + "status": Fore.GREEN if crawl_result.success else Fore.RED, + "timing": Fore.YELLOW + } + ) - # Process the HTML content - crawl_result = await self.aprocess_html( - url=url, - html=html, - extracted_content=extracted_content, - word_count_threshold=word_count_threshold, - extraction_strategy=extraction_strategy, - chunking_strategy=chunking_strategy, - content_filter=content_filter, - css_selector=css_selector, - screenshot=screenshot_data, - verbose=verbose, - is_cached=bool(cached_result), - async_response=async_response, - is_web_url=cache_context.is_web_url, - is_local_file=cache_context.is_local_file, - is_raw_html=cache_context.is_raw_html, - **kwargs, - ) + # Update cache if appropriate + if cache_context.should_write() and not bool(cached_result): + await async_db_manager.acache_url(crawl_result) + + return crawl_result - # Set response data - if 
async_response: - crawl_result.status_code = async_response.status_code - crawl_result.response_headers = async_response.response_headers - crawl_result.downloaded_files = async_response.downloaded_files - else: - crawl_result.status_code = 200 - crawl_result.response_headers = cached_result.response_headers if cached_result else {} - - crawl_result.success = bool(html) - crawl_result.session_id = kwargs.get("session_id", None) - - # if verbose: - # print(f"{Fore.GREEN}{self.tag_format('COMPLETE')} {self.log_icons['COMPLETE']} {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... | Status: {Fore.GREEN if crawl_result.success else Fore.RED}{crawl_result.success} | {Fore.YELLOW}Total: {time.perf_counter() - start_time:.2f}s{Style.RESET_ALL}") - self.logger.success( - message="{url:.50}... | Status: {status} | Total: {timing}", - tag="COMPLETE", - params={ - "url": cache_context.display_url, - "status": crawl_result.success, - "timing": f"{time.perf_counter() - start_time:.2f}s" - }, - colors={ - "status": Fore.GREEN if crawl_result.success else Fore.RED, - "timing": Fore.YELLOW - } + except Exception as e: + if not hasattr(e, "msg"): + e.msg = str(e) + # print(f"{Fore.RED}{self.tag_format('ERROR')} {self.log_icons['ERROR']} Failed to crawl {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... | {e.msg}{Style.RESET_ALL}") + + self.logger.error_status( + url=cache_context.display_url, + error=create_box_message(e.msg, type = "error"), + tag="ERROR" + ) + return CrawlResult( + url=url, + html="", + success=False, + error_message=e.msg ) - # Update cache if appropriate - if cache_context.should_write() and not bool(cached_result): - await async_db_manager.acache_url(crawl_result) - - return crawl_result - - except Exception as e: - if not hasattr(e, "msg"): - e.msg = str(e) - # print(f"{Fore.RED}{self.tag_format('ERROR')} {self.log_icons['ERROR']} Failed to crawl {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... 
| {e.msg}{Style.RESET_ALL}") - - self.logger.error_status( - url=cache_context.display_url, - error=create_box_message(e.msg, type = "error"), - tag="ERROR" - ) - return CrawlResult( - url=url, - html="", - success=False, - error_message=e.msg - ) - async def arun_many( self, urls: List[str], From b02544bc0bf1dac897adec6bb0de730e5b7f3ccd Mon Sep 17 00:00:00 2001 From: UncleCode Date: Tue, 3 Dec 2024 21:28:52 +0800 Subject: [PATCH 51/70] docs: update README and blog for version 0.4.0 release, highlighting new features and improvements --- README.md | 22 +++++++---- docs/md_v2/blog/index.md | 28 ++++++++++++++ docs/md_v2/blog/releases/0.4.0.md | 62 +++++++++++++++++++++++++++++++ mkdocs.yml | 14 ++++--- 4 files changed, 113 insertions(+), 13 deletions(-) create mode 100644 docs/md_v2/blog/index.md create mode 100644 docs/md_v2/blog/releases/0.4.0.md diff --git a/README.md b/README.md index d70af8ad..cbeb4067 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,10 @@ Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant community. It delivers blazing-fast, AI-ready web crawling tailored for LLMs, AI agents, and data pipelines. Open source, flexible, and built for real-time performance, Crawl4AI empowers developers with unmatched speed, precision, and deployment ease. -[✨ Check out latest update v0.3.745](#-recent-updates) + +🎉 **Version 0.4.0 is out!** Introducing our experimental PruningContentFilter - a powerful new algorithm for smarter Markdown generation. Test it out and [share your feedback](https://github.com/unclecode/crawl4ai/issues)! [Read the release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/md_v2/blog/releases/0.4.0.md) + +[✨ Check out latest update v0.4.0](#-recent-updates) ## 🧐 Why Crawl4AI? @@ -623,18 +626,21 @@ async def test_news_crawl(): ## ✨ Recent Updates -- 🚀 **Improved ManagedBrowser Configuration**: Dynamic host and port support for more flexible browser management. 
-- 📝 **Enhanced Markdown Generation**: New generator class for better formatting and customization. -- ⚡ **Fast HTML Formatting**: Significantly optimized HTML formatting in the web crawler. -- 🛠️ **Utility & Sanitization Upgrades**: Improved sanitization and expanded utility functions for streamlined workflows. -- 👥 **Acknowledgments**: Added contributor details and pull request acknowledgments for better transparency. +- 🔬 **PruningContentFilter**: New unsupervised filtering strategy for intelligent content extraction based on text density and relevance scoring. +- 🧵 **Enhanced Thread Safety**: Improved multi-threaded environment handling with better locks and parallel processing support. +- 🤖 **Smart User-Agent Generation**: Advanced user-agent generator with customization options and randomization capabilities. +- 📝 **New Blog Launch**: Stay updated with our detailed release notes and technical deep dives at [crawl4ai.com/blog](https://crawl4ai.com/blog). +- 🧪 **Expanded Test Coverage**: Comprehensive test suite for both PruningContentFilter and BM25ContentFilter with edge case handling. +Read the full details of this release in our [0.4.0 Release Notes](https://github.com/unclecode/crawl4ai/blob/main/docs/md_v2/blog/releases/0.4.0.md). ## 📖 Documentation & Roadmap -For detailed documentation, including installation instructions, advanced features, and API reference, visit our [Documentation Website](https://crawl4ai.com/mkdocs/). +> 🚨 **Documentation Update Alert**: We're undertaking a major documentation overhaul next week to reflect recent updates and improvements. Stay tuned for a more comprehensive and up-to-date guide! -Moreover to check our development plans and upcoming features, check out our [Roadmap](https://github.com/unclecode/crawl4ai/blob/main/ROADMAP.md). +For current documentation, including installation instructions, advanced features, and API reference, visit our [Documentation Website](https://crawl4ai.com/mkdocs/). 
+ +To check our development plans and upcoming features, visit our [Roadmap](https://github.com/unclecode/crawl4ai/blob/main/ROADMAP.md).
📈 Development TODOs diff --git a/docs/md_v2/blog/index.md b/docs/md_v2/blog/index.md new file mode 100644 index 00000000..054b12f8 --- /dev/null +++ b/docs/md_v2/blog/index.md @@ -0,0 +1,28 @@ +# Crawl4AI Blog + +Welcome to the Crawl4AI blog! Here you'll find detailed release notes, technical deep dives, and news about the project. + +## Latest Release + +### [0.4.0 - Major Content Filtering Update](releases/0.4.0.md) +*December 1, 2024* + +Introducing significant improvements to content filtering, multi-threaded environment handling, and user-agent generation. This release features the new PruningContentFilter, enhanced thread safety, and improved test coverage. + +[Read full release notes →](releases/0.4.0.md) + +## Project History + +Want to see how we got here? Check out our [complete changelog](https://github.com/unclecode/crawl4ai/blob/main/CHANGELOG.md) covering all previous versions and the evolution of Crawl4AI. + +## Categories + +- [Technical Deep Dives](/blog/technical) - Coming soon +- [Tutorials & Guides](/blog/tutorials) - Coming soon +- [Community Updates](/blog/community) - Coming soon + +## Stay Updated + +- Star us on [GitHub](https://github.com/unclecode/crawl4ai) +- Follow [@unclecode](https://twitter.com/unclecode) on Twitter +- Join our community discussions on GitHub diff --git a/docs/md_v2/blog/releases/0.4.0.md b/docs/md_v2/blog/releases/0.4.0.md new file mode 100644 index 00000000..0e7ee5df --- /dev/null +++ b/docs/md_v2/blog/releases/0.4.0.md @@ -0,0 +1,62 @@ +# Release Summary for Version 0.4.0 (December 1, 2024) + +## Overview +The 0.4.0 release introduces significant improvements to content filtering, multi-threaded environment handling, user-agent generation, and test coverage. Key highlights include the introduction of the PruningContentFilter, designed to automatically identify and extract the most valuable parts of an HTML document, as well as enhancements to the BM25ContentFilter to extend its versatility and effectiveness. 
+ +## Major Features and Enhancements + +### 1. PruningContentFilter +- Introduced a new unsupervised content filtering strategy that scores and prunes less relevant nodes in an HTML document based on metrics like text and link density. +- Focuses on retaining the most valuable parts of the content, making it highly effective for extracting relevant information from complex web pages. +- Fully documented with updated README and expanded user guides. + +### 2. User-Agent Generator +- Added a user-agent generator utility that resolves compatibility issues and supports customizable user-agent strings. +- By default, the generator randomizes user agents for each request, adding diversity, but users can customize it for tailored scenarios. + +### 3. Enhanced Thread Safety +- Improved handling of multi-threaded environments by adding better thread locks for parallel processing, ensuring consistency and stability when running multiple threads. + +### 4. Extended Content Filtering Strategies +- Users now have access to both the PruningContentFilter for unsupervised extraction and the BM25ContentFilter for supervised filtering based on user queries. +- Enhanced BM25ContentFilter with improved capabilities to process page titles, meta tags, and descriptions, allowing for more effective classification and clustering of text chunks. + +### 5. Documentation Updates +- Updated examples and tutorials to promote the use of the PruningContentFilter alongside the BM25ContentFilter, providing clear instructions for selecting the appropriate filter for each use case. + +### 6. Unit Test Enhancements +- Added unit tests for PruningContentFilter to ensure accuracy and reliability. +- Enhanced BM25ContentFilter tests to cover additional edge cases and performance metrics, particularly for malformed HTML inputs. 
+ +## Revised Change Logs for Version 0.4.0 + +### PruningContentFilter (Dec 01, 2024) +- Introduced the PruningContentFilter to optimize content extraction by pruning less relevant HTML nodes. + - **Affected Files:** + - **crawl4ai/content_filter_strategy.py**: Added a scoring-based pruning algorithm. + - **README.md**: Updated to include PruningContentFilter usage. + - **docs/md_v2/basic/content_filtering.md**: Expanded user documentation, detailing the use and benefits of PruningContentFilter. + +### Unit Tests for PruningContentFilter (Dec 01, 2024) +- Added comprehensive unit tests for PruningContentFilter to ensure correctness and efficiency. + - **Affected Files:** + - **tests/async/test_content_filter_prune.py**: Created tests covering different pruning scenarios to ensure stability and correctness. + +### Enhanced BM25ContentFilter Tests (Dec 01, 2024) +- Expanded tests to cover additional extraction scenarios and performance metrics, improving robustness. + - **Affected Files:** + - **tests/async/test_content_filter_bm25.py**: Added tests for edge cases, including malformed HTML inputs. + +### Documentation and Example Updates (Dec 01, 2024) +- Revised examples to illustrate the use of PruningContentFilter alongside existing content filtering methods. + - **Affected Files:** + - **docs/examples/quickstart_async.py**: Enhanced example clarity and usability for new users. + +## Experimental Features +- The PruningContentFilter is still under experimental development, and we continue to gather feedback for further refinements. + +## Conclusion +This release significantly enhances the content extraction capabilities of Crawl4AI with the introduction of the PruningContentFilter, improved supervised filtering with BM25ContentFilter, and robust multi-threaded handling. Additionally, the user-agent generator provides much-needed versatility, resolving compatibility issues faced by many users. 
+ +Users are encouraged to experiment with the new content filtering methods to determine which best suits their needs. + diff --git a/mkdocs.yml b/mkdocs.yml index 1b26b9df..4ba7c2a7 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -10,7 +10,11 @@ nav: - 'Installation': 'basic/installation.md' - 'Docker Deplotment': 'basic/docker-deploymeny.md' - 'Quick Start': 'basic/quickstart.md' - + - Changelog & Blog: + - 'Blog Home': 'blog/index.md' + - 'Latest (0.4.0)': 'blog/releases/0.4.0.md' + - 'Changelog': 'https://github.com/unclecode/crawl4ai/blob/main/CHANGELOG.md' + - Basic: - 'Simple Crawling': 'basic/simple-crawling.md' - 'Output Formats': 'basic/output-formats.md' @@ -50,12 +54,12 @@ nav: - '5. Dynamic Content': 'tutorial/episode_05_JavaScript_Execution_and_Dynamic_Content_Handling.md' - '6. Magic Mode': 'tutorial/episode_06_Magic_Mode_and_Anti-Bot_Protection.md' - '7. Content Cleaning': 'tutorial/episode_07_Content_Cleaning_and_Fit_Markdown.md' - - '8. Media Handling': 'tutorial/episode_08_Media_Handling:_Images,_Videos,_and_Audio.md' + - '8. Media Handling': 'tutorial/episode_08_Media_Handling_Images_Videos_and_Audio.md' - '9. Link Analysis': 'tutorial/episode_09_Link_Analysis_and_Smart_Filtering.md' - '10. User Simulation': 'tutorial/episode_10_Custom_Headers,_Identity,_and_User_Simulation.md' - - '11.1. JSON CSS': 'tutorial/episode_11_1_Extraction_Strategies:_JSON_CSS.md' - - '11.2. LLM Strategy': 'tutorial/episode_11_2_Extraction_Strategies:_LLM.md' - - '11.3. Cosine Strategy': 'tutorial/episode_11_3_Extraction_Strategies:_Cosine.md' + - '11.1. JSON CSS': 'tutorial/episode_11_1_Extraction_Strategies_JSON_CSS.md' + - '11.2. LLM Strategy': 'tutorial/episode_11_2_Extraction_Strategies_LLM.md' + - '11.3. Cosine Strategy': 'tutorial/episode_11_3_Extraction_Strategies_Cosine.md' - '12. Session Crawling': 'tutorial/episode_12_Session-Based_Crawling_for_Dynamic_Websites.md' - '13. 
Text Chunking': 'tutorial/episode_13_Chunking_Strategies_for_Large_Text_Processing.md' - '14. Custom Workflows': 'tutorial/episode_14_Hooks_and_Custom_Workflow_with_AsyncWebCrawler.md' From 486db3a7713e6ffb22dc378c989b67bdc57fff74 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Wed, 4 Dec 2024 20:26:39 +0800 Subject: [PATCH 52/70] Updated to version 0.4.0 with new features - Enhanced error handling in async crawler. - Added flexible options in Markdown generation. - Updated user agent settings for improved reliability. - Reflected changes in documentation and examples. --- crawl4ai/__version__.py | 2 +- crawl4ai/async_crawler_strategy.py | 19 ++++++++- crawl4ai/markdown_generation_strategy.py | 12 ++++-- crawl4ai/user_agent_generator.py | 1 + docs/examples/quickstart_async.py | 51 +++++++++++++++++++----- 5 files changed, 69 insertions(+), 16 deletions(-) diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index 189a2955..6f8b06f4 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,2 +1,2 @@ # crawl4ai/_version.py -__version__ = "0.3.747" +__version__ = "0.4.0" diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 3d24bd84..493597ea 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -6,6 +6,7 @@ from typing import Callable, Dict, Any, List, Optional, Awaitable import os, sys, shutil import tempfile, subprocess from playwright.async_api import async_playwright, Page, Browser, Error +from playwright.async_api import TimeoutError as PlaywrightTimeoutError from io import BytesIO from PIL import Image, ImageDraw, ImageFont from pathlib import Path @@ -223,6 +224,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): self.use_cached_html = use_cached_html self.user_agent = kwargs.get( "user_agent", + # "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.5845.187 Safari/604.1 Edg/117.0.2045.47" 
"Mozilla/5.0 (Linux; Android 11; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.120 Mobile Safari/537.36" ) user_agenr_generator = UserAgentGenerator() @@ -941,11 +943,24 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): }); } """ + try: - await page.wait_for_load_state() + try: + await page.wait_for_load_state( + # state="load", + state="domcontentloaded", + timeout=5 + ) + except PlaywrightTimeoutError: + pass await page.evaluate(update_image_dimensions_js) except Exception as e: - raise RuntimeError(f"Error updating image dimensions ACS-UPDATE_IMAGE_DIMENSIONS_JS: {str(e)}") + self.logger.error( + message="Error updating image dimensions ACS-UPDATE_IMAGE_DIMENSIONS_JS: {error}", + tag="ERROR", + params={"error": str(e)} + ) + # raise RuntimeError(f"Error updating image dimensions ACS-UPDATE_IMAGE_DIMENSIONS_JS: {str(e)}") # Wait a bit for any onload events to complete await page.wait_for_timeout(100) diff --git a/crawl4ai/markdown_generation_strategy.py b/crawl4ai/markdown_generation_strategy.py index f242054d..1e0ca664 100644 --- a/crawl4ai/markdown_generation_strategy.py +++ b/crawl4ai/markdown_generation_strategy.py @@ -11,8 +11,9 @@ LINK_PATTERN = re.compile(r'!?\[([^\]]+)\]\(([^)]+?)(?:\s+"([^"]*)")?\)') class MarkdownGenerationStrategy(ABC): """Abstract base class for markdown generation strategies.""" - def __init__(self, content_filter: Optional[RelevantContentFilter] = None): + def __init__(self, content_filter: Optional[RelevantContentFilter] = None, options: Optional[Dict[str, Any]] = None): self.content_filter = content_filter + self.options = options or {} @abstractmethod def generate_markdown(self, @@ -27,8 +28,8 @@ class MarkdownGenerationStrategy(ABC): class DefaultMarkdownGenerator(MarkdownGenerationStrategy): """Default implementation of markdown generation strategy.""" - def __init__(self, content_filter: Optional[RelevantContentFilter] = None): - super().__init__(content_filter) + def __init__(self, 
content_filter: Optional[RelevantContentFilter] = None, options: Optional[Dict[str, Any]] = None): + super().__init__(content_filter, options) def convert_links_to_citations(self, markdown: str, base_url: str = "") -> Tuple[str, str]: link_map = {} @@ -74,6 +75,7 @@ class DefaultMarkdownGenerator(MarkdownGenerationStrategy): cleaned_html: str, base_url: str = "", html2text_options: Optional[Dict[str, Any]] = None, + options: Optional[Dict[str, Any]] = None, content_filter: Optional[RelevantContentFilter] = None, citations: bool = True, **kwargs) -> MarkdownGenerationResult: @@ -82,6 +84,10 @@ class DefaultMarkdownGenerator(MarkdownGenerationStrategy): h = CustomHTML2Text() if html2text_options: h.update_params(**html2text_options) + elif options: + h.update_params(**options) + elif self.options: + h.update_params(**self.options) # Generate raw markdown raw_markdown = h.handle(cleaned_html) diff --git a/crawl4ai/user_agent_generator.py b/crawl4ai/user_agent_generator.py index 0a4df0bb..a1f3a49e 100644 --- a/crawl4ai/user_agent_generator.py +++ b/crawl4ai/user_agent_generator.py @@ -236,6 +236,7 @@ class UserAgentGenerator: # Example usage: if __name__ == "__main__": generator = UserAgentGenerator() + print(generator.generate()) print("\nSingle browser (Chrome):") print(generator.generate(num_browsers=1, browser_type='chrome')) diff --git a/docs/examples/quickstart_async.py b/docs/examples/quickstart_async.py index 73d695c3..176b0ba7 100644 --- a/docs/examples/quickstart_async.py +++ b/docs/examples/quickstart_async.py @@ -547,19 +547,50 @@ async def generate_knowledge_graph(): f.write(result.extracted_content) async def fit_markdown_remove_overlay(): - async with AsyncWebCrawler(headless = False) as crawler: - url = "https://janineintheworld.com/places-to-visit-in-central-mexico" + async with AsyncWebCrawler( + headless=True, # Set to False to see what is happening + verbose=True, + user_agent_mode="random", + user_agent_generator_config={ + "device_type": "mobile", 
+ "os_type": "android" + }, + ) as crawler: result = await crawler.arun( - url=url, + url='https://www.kidocode.com/degrees/technology', cache_mode=CacheMode.BYPASS, - word_count_threshold = 10, - remove_overlay_elements=True, - screenshot = True + markdown_generator=DefaultMarkdownGenerator( + content_filter=PruningContentFilter(threshold=0.48, threshold_type="fixed", min_word_threshold=0), + options={ + "ignore_links": True + } + ), + # markdown_generator=DefaultMarkdownGenerator( + # content_filter=BM25ContentFilter(user_query=None, bm25_threshold=1.0), + # options={ + # "ignore_links": True + # } + # ), ) - # Save markdown to file - with open(os.path.join(__location__, "mexico_places.md"), "w") as f: - f.write(result.fit_markdown) - + + if result.success: + print(len(result.markdown_v2.raw_markdown)) + print(len(result.markdown_v2.markdown_with_citations)) + print(len(result.markdown_v2.fit_markdown)) + + # Save clean html + with open(os.path.join(__location__, "output/cleaned_html.html"), "w") as f: + f.write(result.cleaned_html) + + with open(os.path.join(__location__, "output/output_raw_markdown.md"), "w") as f: + f.write(result.markdown_v2.raw_markdown) + + with open(os.path.join(__location__, "output/output_markdown_with_citations.md"), "w") as f: + f.write(result.markdown_v2.markdown_with_citations) + + with open(os.path.join(__location__, "output/output_fit_markdown.md"), "w") as f: + f.write(result.markdown_v2.fit_markdown) + print("Done") From 8c611dcb4b83eb2ac2e8196ea950564804621a03 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 5 Dec 2024 22:33:47 +0800 Subject: [PATCH 53/70] Refactored web scraping components - Enhanced the web scraping strategy with new methods for optimized media handling. - Added new utility functions for better content processing. - Refined existing features for improved accuracy and efficiency in scraping tasks. - Introduced more robust filtering criteria for media elements. 
--- crawl4ai/content_scraping_strategy.py | 778 ++++++++++++-------------- crawl4ai/utils.py | 53 +- crawl4ai/utils.scraping.py | 0 docs/examples/quickstart_async.py | 7 +- 4 files changed, 408 insertions(+), 430 deletions(-) create mode 100644 crawl4ai/utils.scraping.py diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index de8894b7..970c40f0 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -6,10 +6,11 @@ from concurrent.futures import ThreadPoolExecutor import asyncio, requests, re, os from .config import * from bs4 import element, NavigableString, Comment +from bs4 import PageElement, Tag from urllib.parse import urljoin from requests.exceptions import InvalidSchema # from .content_cleaning_strategy import ContentCleaningStrategy -from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter, PruningContentFilter +from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter#, HeuristicContentFilter from .markdown_generation_strategy import MarkdownGenerationStrategy, DefaultMarkdownGenerator from .models import MarkdownGenerationResult from .utils import ( @@ -80,45 +81,21 @@ class WebScrapingStrategy(ContentScrapingStrategy): async def ascrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]: return await asyncio.to_thread(self._get_content_of_website_optimized, url, html, **kwargs) - def _generate_markdown_content(self, cleaned_html: str, html: str, url: str, success: bool, **kwargs) -> Dict[str, Any]: - """Generate markdown content using either new strategy or legacy method. 
- - Args: - cleaned_html: Sanitized HTML content - html: Original HTML content - url: Base URL of the page - success: Whether scraping was successful - **kwargs: Additional options including: - - markdown_generator: Optional[MarkdownGenerationStrategy] - - html2text: Dict[str, Any] options for HTML2Text - - content_filter: Optional[RelevantContentFilter] - - fit_markdown: bool - - fit_markdown_user_query: Optional[str] - - fit_markdown_bm25_threshold: float - - Returns: - Dict containing markdown content in various formats - """ markdown_generator: Optional[MarkdownGenerationStrategy] = kwargs.get('markdown_generator', DefaultMarkdownGenerator()) if markdown_generator: try: if kwargs.get('fit_markdown', False) and not markdown_generator.content_filter: - markdown_generator.content_filter = PruningContentFilter( - threshold_type=kwargs.get('fit_markdown_treshold_type', 'fixed'), - threshold=kwargs.get('fit_markdown_treshold', 0.48), - min_word_threshold=kwargs.get('fit_markdown_min_word_threshold', ), + markdown_generator.content_filter = BM25ContentFilter( + user_query=kwargs.get('fit_markdown_user_query', None), + bm25_threshold=kwargs.get('fit_markdown_bm25_threshold', 1.0) ) - # markdown_generator.content_filter = BM25ContentFilter( - # user_query=kwargs.get('fit_markdown_user_query', None), - # bm25_threshold=kwargs.get('fit_markdown_bm25_threshold', 1.0) - # ) markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown( cleaned_html=cleaned_html, @@ -182,13 +159,335 @@ class WebScrapingStrategy(ContentScrapingStrategy): 'markdown_v2' : markdown_v2 } + def flatten_nested_elements(self, node): + if isinstance(node, NavigableString): + return node + if len(node.contents) == 1 and isinstance(node.contents[0], Tag) and node.contents[0].name == node.name: + return self.flatten_nested_elements(node.contents[0]) + node.contents = [self.flatten_nested_elements(child) for child in node.contents] + return node + + def 
find_closest_parent_with_useful_text(self, tag, **kwargs): + image_description_min_word_threshold = kwargs.get('image_description_min_word_threshold', IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD) + current_tag = tag + while current_tag: + current_tag = current_tag.parent + # Get the text content of the parent tag + if current_tag: + text_content = current_tag.get_text(separator=' ',strip=True) + # Check if the text content has at least word_count_threshold + if len(text_content.split()) >= image_description_min_word_threshold: + return text_content + return None + + def remove_unwanted_attributes(self, element, important_attrs, keep_data_attributes=False): + attrs_to_remove = [] + for attr in element.attrs: + if attr not in important_attrs: + if keep_data_attributes: + if not attr.startswith('data-'): + attrs_to_remove.append(attr) + else: + attrs_to_remove.append(attr) + + for attr in attrs_to_remove: + del element[attr] + + def process_image(self, img, url, index, total_images, **kwargs): + parse_srcset = lambda s: [{'url': u.strip().split()[0], 'width': u.strip().split()[-1].rstrip('w') + if ' ' in u else None} + for u in [f"http{p}" for p in s.split("http") if p]] + + # Constants for checks + classes_to_check = frozenset(['button', 'icon', 'logo']) + tags_to_check = frozenset(['button', 'input']) + + # Pre-fetch commonly used attributes + style = img.get('style', '') + alt = img.get('alt', '') + src = img.get('src', '') + data_src = img.get('data-src', '') + width = img.get('width') + height = img.get('height') + parent = img.parent + parent_classes = parent.get('class', []) + + # Quick validation checks + if ('display:none' in style or + parent.name in tags_to_check or + any(c in cls for c in parent_classes for cls in classes_to_check) or + any(c in src for c in classes_to_check) or + any(c in alt for c in classes_to_check)): + return None + + # Quick score calculation + score = 0 + if width and width.isdigit(): + width_val = int(width) + score += 1 if width_val > 
150 else 0 + if height and height.isdigit(): + height_val = int(height) + score += 1 if height_val > 150 else 0 + if alt: + score += 1 + score += index/total_images < 0.5 + + image_format = '' + if "data:image/" in src: + image_format = src.split(',')[0].split(';')[0].split('/')[1].split(';')[0] + else: + image_format = os.path.splitext(src)[1].lower().strip('.').split('?')[0] + + if image_format in ('jpg', 'png', 'webp', 'avif'): + score += 1 + + if score <= kwargs.get('image_score_threshold', IMAGE_SCORE_THRESHOLD): + return None + + # Use set for deduplication + unique_urls = set() + image_variants = [] + + # Generate a unique group ID for this set of variants + group_id = index + + # Base image info template + image_description_min_word_threshold = kwargs.get('image_description_min_word_threshold', IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD) + base_info = { + 'alt': alt, + 'desc': self.find_closest_parent_with_useful_text(img, **kwargs), + 'score': score, + 'type': 'image', + 'group_id': group_id # Group ID for this set of variants + } + + # Inline function for adding variants + def add_variant(src, width=None): + if src and not src.startswith('data:') and src not in unique_urls: + unique_urls.add(src) + image_variants.append({**base_info, 'src': src, 'width': width}) + + # Process all sources + add_variant(src) + add_variant(data_src) + + # Handle srcset and data-srcset in one pass + for attr in ('srcset', 'data-srcset'): + if value := img.get(attr): + for source in parse_srcset(value): + add_variant(source['url'], source['width']) + + # Quick picture element check + if picture := img.find_parent('picture'): + for source in picture.find_all('source'): + if srcset := source.get('srcset'): + for src in parse_srcset(srcset): + add_variant(src['url'], src['width']) + + # Framework-specific attributes in one pass + for attr, value in img.attrs.items(): + if attr.startswith('data-') and ('src' in attr or 'srcset' in attr) and 'http' in value: + add_variant(value) + + 
return image_variants if image_variants else None + + + def process_element(self, url, element: PageElement, **kwargs) -> Dict[str, Any]: + media = {'images': [], 'videos': [], 'audios': []} + internal_links_dict = {} + external_links_dict = {} + self._process_element( + url, + element, + media, + internal_links_dict, + external_links_dict, + **kwargs + ) + return { + 'media': media, + 'internal_links_dict': internal_links_dict, + 'external_links_dict': external_links_dict + } + + def _process_element(self, url, element: PageElement, media: Dict[str, Any], internal_links_dict: Dict[str, Any], external_links_dict: Dict[str, Any], **kwargs) -> bool: + try: + if isinstance(element, NavigableString): + if isinstance(element, Comment): + element.extract() + return False + + # if element.name == 'img': + # process_image(element, url, 0, 1) + # return True + + if element.name in ['script', 'style', 'link', 'meta', 'noscript']: + element.decompose() + return False + + keep_element = False + + exclude_social_media_domains = SOCIAL_MEDIA_DOMAINS + kwargs.get('exclude_social_media_domains', []) + exclude_social_media_domains = list(set(exclude_social_media_domains)) + + try: + if element.name == 'a' and element.get('href'): + href = element.get('href', '').strip() + if not href: # Skip empty hrefs + return False + + url_base = url.split('/')[2] + + # Normalize the URL + try: + normalized_href = normalize_url(href, url) + except ValueError as e: + # logging.warning(f"Invalid URL format: {href}, Error: {str(e)}") + return False + + link_data = { + 'href': normalized_href, + 'text': element.get_text().strip(), + 'title': element.get('title', '').strip() + } + + # Check for duplicates and add to appropriate dictionary + is_external = is_external_url(normalized_href, url_base) + if is_external: + if normalized_href not in external_links_dict: + external_links_dict[normalized_href] = link_data + else: + if normalized_href not in internal_links_dict: + 
internal_links_dict[normalized_href] = link_data + + keep_element = True + + # Handle external link exclusions + if is_external: + if kwargs.get('exclude_external_links', False): + element.decompose() + return False + elif kwargs.get('exclude_social_media_links', False): + if any(domain in normalized_href.lower() for domain in exclude_social_media_domains): + element.decompose() + return False + elif kwargs.get('exclude_domains', []): + if any(domain in normalized_href.lower() for domain in kwargs.get('exclude_domains', [])): + element.decompose() + return False + + except Exception as e: + raise Exception(f"Error processing links: {str(e)}") + + try: + if element.name == 'img': + potential_sources = ['src', 'data-src', 'srcset' 'data-lazy-src', 'data-original'] + src = element.get('src', '') + while not src and potential_sources: + src = element.get(potential_sources.pop(0), '') + if not src: + element.decompose() + return False + + # If it is srcset pick up the first image + if 'srcset' in element.attrs: + src = element.attrs['srcset'].split(',')[0].split(' ')[0] + + # Check flag if we should remove external images + if kwargs.get('exclude_external_images', False): + src_url_base = src.split('/')[2] + url_base = url.split('/')[2] + if url_base not in src_url_base: + element.decompose() + return False + + if not kwargs.get('exclude_external_images', False) and kwargs.get('exclude_social_media_links', False): + src_url_base = src.split('/')[2] + url_base = url.split('/')[2] + if any(domain in src for domain in exclude_social_media_domains): + element.decompose() + return False + + # Handle exclude domains + if kwargs.get('exclude_domains', []): + if any(domain in src for domain in kwargs.get('exclude_domains', [])): + element.decompose() + return False + + return True # Always keep image elements + except Exception as e: + raise "Error processing images" + + + # Check if flag to remove all forms is set + if kwargs.get('remove_forms', False) and element.name == 
'form': + element.decompose() + return False + + if element.name in ['video', 'audio']: + media[f"{element.name}s"].append({ + 'src': element.get('src'), + 'alt': element.get('alt'), + 'type': element.name, + 'description': self.find_closest_parent_with_useful_text(element, **kwargs) + }) + source_tags = element.find_all('source') + for source_tag in source_tags: + media[f"{element.name}s"].append({ + 'src': source_tag.get('src'), + 'alt': element.get('alt'), + 'type': element.name, + 'description': self.find_closest_parent_with_useful_text(element, **kwargs) + }) + return True # Always keep video and audio elements + + if element.name in ONLY_TEXT_ELIGIBLE_TAGS: + if kwargs.get('only_text', False): + element.replace_with(element.get_text()) + + try: + self.remove_unwanted_attributes(element, IMPORTANT_ATTRS, kwargs.get('keep_data_attributes', False)) + except Exception as e: + # print('Error removing unwanted attributes:', str(e)) + self._log('error', + message="Error removing unwanted attributes: {error}", + tag="SCRAPE", + params={"error": str(e)} + ) + # Process children + for child in list(element.children): + if isinstance(child, NavigableString) and not isinstance(child, Comment): + if len(child.strip()) > 0: + keep_element = True + else: + if self._process_element(url, child, media, internal_links_dict, external_links_dict, **kwargs): + keep_element = True + + + # Check word count + word_count_threshold = kwargs.get('word_count_threshold', MIN_WORD_THRESHOLD) + if not keep_element: + word_count = len(element.get_text(strip=True).split()) + keep_element = word_count >= word_count_threshold + + if not keep_element: + element.decompose() + + return keep_element + except Exception as e: + # print('Error processing element:', str(e)) + self._log('error', + message="Error processing element: {error}", + tag="SCRAPE", + params={"error": str(e)} + ) + return False def _get_content_of_website_optimized(self, url: str, html: str, word_count_threshold: int = 
MIN_WORD_THRESHOLD, css_selector: str = None, **kwargs) -> Dict[str, Any]: success = True if not html: return None - # soup = BeautifulSoup(html, 'html.parser') soup = BeautifulSoup(html, 'lxml') body = soup.body @@ -200,15 +499,24 @@ class WebScrapingStrategy(ContentScrapingStrategy): tag="SCRAPE", params={"error": str(e)} ) - # print('Error extracting metadata:', str(e)) meta = {} + # Handle tag-based removal first - faster than CSS selection + excluded_tags = set(kwargs.get('excluded_tags', []) or []) + if excluded_tags: + for element in body.find_all(lambda tag: tag.name in excluded_tags): + element.extract() - image_description_min_word_threshold = kwargs.get('image_description_min_word_threshold', IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD) - - for tag in kwargs.get('excluded_tags', []) or []: - for el in body.select(tag): - el.decompose() + # Handle CSS selector-based removal + excluded_selector = kwargs.get('excluded_selector', '') + if excluded_selector: + is_single_selector = ',' not in excluded_selector and ' ' not in excluded_selector + if is_single_selector: + while element := body.select_one(excluded_selector): + element.extract() + else: + for element in body.select(excluded_selector): + element.extract() if css_selector: selected_elements = body.select(css_selector) @@ -227,384 +535,17 @@ class WebScrapingStrategy(ContentScrapingStrategy): for el in selected_elements: body.append(el) - links = {'internal': [], 'external': []} - media = {'images': [], 'videos': [], 'audios': []} - internal_links_dict = {} - external_links_dict = {} - - # Extract meaningful text for media files from closest parent - def find_closest_parent_with_useful_text(tag): - current_tag = tag - while current_tag: - current_tag = current_tag.parent - # Get the text content of the parent tag - if current_tag: - text_content = current_tag.get_text(separator=' ',strip=True) - # Check if the text content has at least word_count_threshold - if len(text_content.split()) >= 
image_description_min_word_threshold: - return text_content - return None - - def process_image_old(img, url, index, total_images): - - - #Check if an image has valid display and inside undesired html elements - def is_valid_image(img, parent, parent_classes): - style = img.get('style', '') - src = img.get('src', '') - classes_to_check = ['button', 'icon', 'logo'] - tags_to_check = ['button', 'input'] - return all([ - 'display:none' not in style, - src, - not any(s in var for var in [src, img.get('alt', ''), *parent_classes] for s in classes_to_check), - parent.name not in tags_to_check - ]) - - #Score an image for it's usefulness - def score_image_for_usefulness(img, base_url, index, images_count): - image_height = img.get('height') - height_value, height_unit = parse_dimension(image_height) - image_width = img.get('width') - width_value, width_unit = parse_dimension(image_width) - image_size = 0 #int(fetch_image_file_size(img,base_url) or 0) - image_src = img.get('src','') - if "data:image/" in image_src: - image_format = image_src.split(',')[0].split(';')[0].split('/')[1] - else: - image_format = os.path.splitext(img.get('src',''))[1].lower() - # Remove . 
from format - image_format = image_format.strip('.').split('?')[0] - score = 0 - if height_value: - if height_unit == 'px' and height_value > 150: - score += 1 - if height_unit in ['%','vh','vmin','vmax'] and height_value >30: - score += 1 - if width_value: - if width_unit == 'px' and width_value > 150: - score += 1 - if width_unit in ['%','vh','vmin','vmax'] and width_value >30: - score += 1 - if image_size > 10000: - score += 1 - if img.get('alt') != '': - score+=1 - if any(image_format==format for format in ['jpg','png','webp']): - score+=1 - if index/images_count<0.5: - score+=1 - return score - - if not is_valid_image(img, img.parent, img.parent.get('class', [])): - return None - - score = score_image_for_usefulness(img, url, index, total_images) - if score <= kwargs.get('image_score_threshold', IMAGE_SCORE_THRESHOLD): - return None - - base_result = { - 'src': img.get('src', ''), - 'data-src': img.get('data-src', ''), - 'alt': img.get('alt', ''), - 'desc': find_closest_parent_with_useful_text(img), - 'score': score, - 'type': 'image' - } - - sources = [] - srcset = img.get('srcset', '') - if srcset: - sources = parse_srcset(srcset) - if sources: - return [dict(base_result, src=source['url'], width=source['width']) - for source in sources] - - return [base_result] # Always return a list - - def process_image(img, url, index, total_images): - parse_srcset = lambda s: [{'url': u.strip().split()[0], 'width': u.strip().split()[-1].rstrip('w') - if ' ' in u else None} - for u in [f"http{p}" for p in s.split("http") if p]] - - # Constants for checks - classes_to_check = frozenset(['button', 'icon', 'logo']) - tags_to_check = frozenset(['button', 'input']) - - # Pre-fetch commonly used attributes - style = img.get('style', '') - alt = img.get('alt', '') - src = img.get('src', '') - data_src = img.get('data-src', '') - width = img.get('width') - height = img.get('height') - parent = img.parent - parent_classes = parent.get('class', []) - - # Quick validation checks - 
if ('display:none' in style or - parent.name in tags_to_check or - any(c in cls for c in parent_classes for cls in classes_to_check) or - any(c in src for c in classes_to_check) or - any(c in alt for c in classes_to_check)): - return None - - # Quick score calculation - score = 0 - if width and width.isdigit(): - width_val = int(width) - score += 1 if width_val > 150 else 0 - if height and height.isdigit(): - height_val = int(height) - score += 1 if height_val > 150 else 0 - if alt: - score += 1 - score += index/total_images < 0.5 - - image_format = '' - if "data:image/" in src: - image_format = src.split(',')[0].split(';')[0].split('/')[1].split(';')[0] - else: - image_format = os.path.splitext(src)[1].lower().strip('.').split('?')[0] - - if image_format in ('jpg', 'png', 'webp', 'avif'): - score += 1 - - if score <= kwargs.get('image_score_threshold', IMAGE_SCORE_THRESHOLD): - return None - - # Use set for deduplication - unique_urls = set() - image_variants = [] - - # Generate a unique group ID for this set of variants - group_id = index - - # Base image info template - base_info = { - 'alt': alt, - 'desc': find_closest_parent_with_useful_text(img), - 'score': score, - 'type': 'image', - 'group_id': group_id # Group ID for this set of variants - } - - # Inline function for adding variants - def add_variant(src, width=None): - if src and not src.startswith('data:') and src not in unique_urls: - unique_urls.add(src) - image_variants.append({**base_info, 'src': src, 'width': width}) - - # Process all sources - add_variant(src) - add_variant(data_src) - - # Handle srcset and data-srcset in one pass - for attr in ('srcset', 'data-srcset'): - if value := img.get(attr): - for source in parse_srcset(value): - add_variant(source['url'], source['width']) - - # Quick picture element check - if picture := img.find_parent('picture'): - for source in picture.find_all('source'): - if srcset := source.get('srcset'): - for src in parse_srcset(srcset): - add_variant(src['url'], 
src['width']) - - # Framework-specific attributes in one pass - for attr, value in img.attrs.items(): - if attr.startswith('data-') and ('src' in attr or 'srcset' in attr) and 'http' in value: - add_variant(value) - - return image_variants if image_variants else None - - def remove_unwanted_attributes(element, important_attrs, keep_data_attributes=False): - attrs_to_remove = [] - for attr in element.attrs: - if attr not in important_attrs: - if keep_data_attributes: - if not attr.startswith('data-'): - attrs_to_remove.append(attr) - else: - attrs_to_remove.append(attr) - - for attr in attrs_to_remove: - del element[attr] + result_obj = self.process_element( + url, + body, + word_count_threshold = word_count_threshold, + **kwargs + ) - def process_element(element: element.PageElement) -> bool: - try: - if isinstance(element, NavigableString): - if isinstance(element, Comment): - element.extract() - return False - - # if element.name == 'img': - # process_image(element, url, 0, 1) - # return True - - if element.name in ['script', 'style', 'link', 'meta', 'noscript']: - element.decompose() - return False - - keep_element = False - - exclude_social_media_domains = SOCIAL_MEDIA_DOMAINS + kwargs.get('exclude_social_media_domains', []) - exclude_social_media_domains = list(set(exclude_social_media_domains)) - - try: - if element.name == 'a' and element.get('href'): - href = element.get('href', '').strip() - if not href: # Skip empty hrefs - return False - - url_base = url.split('/')[2] - - # Normalize the URL - try: - normalized_href = normalize_url(href, url) - except ValueError as e: - # logging.warning(f"Invalid URL format: {href}, Error: {str(e)}") - return False - - link_data = { - 'href': normalized_href, - 'text': element.get_text().strip(), - 'title': element.get('title', '').strip() - } - - # Check for duplicates and add to appropriate dictionary - is_external = is_external_url(normalized_href, url_base) - if is_external: - if normalized_href not in 
external_links_dict: - external_links_dict[normalized_href] = link_data - else: - if normalized_href not in internal_links_dict: - internal_links_dict[normalized_href] = link_data - - keep_element = True - - # Handle external link exclusions - if is_external: - if kwargs.get('exclude_external_links', False): - element.decompose() - return False - elif kwargs.get('exclude_social_media_links', False): - if any(domain in normalized_href.lower() for domain in exclude_social_media_domains): - element.decompose() - return False - elif kwargs.get('exclude_domains', []): - if any(domain in normalized_href.lower() for domain in kwargs.get('exclude_domains', [])): - element.decompose() - return False - - except Exception as e: - raise Exception(f"Error processing links: {str(e)}") - - try: - if element.name == 'img': - potential_sources = ['src', 'data-src', 'srcset' 'data-lazy-src', 'data-original'] - src = element.get('src', '') - while not src and potential_sources: - src = element.get(potential_sources.pop(0), '') - if not src: - element.decompose() - return False - - # If it is srcset pick up the first image - if 'srcset' in element.attrs: - src = element.attrs['srcset'].split(',')[0].split(' ')[0] - - # Check flag if we should remove external images - if kwargs.get('exclude_external_images', False): - src_url_base = src.split('/')[2] - url_base = url.split('/')[2] - if url_base not in src_url_base: - element.decompose() - return False - - if not kwargs.get('exclude_external_images', False) and kwargs.get('exclude_social_media_links', False): - src_url_base = src.split('/')[2] - url_base = url.split('/')[2] - if any(domain in src for domain in exclude_social_media_domains): - element.decompose() - return False - - # Handle exclude domains - if kwargs.get('exclude_domains', []): - if any(domain in src for domain in kwargs.get('exclude_domains', [])): - element.decompose() - return False - - return True # Always keep image elements - except Exception as e: - raise "Error 
processing images" - - - # Check if flag to remove all forms is set - if kwargs.get('remove_forms', False) and element.name == 'form': - element.decompose() - return False - - if element.name in ['video', 'audio']: - media[f"{element.name}s"].append({ - 'src': element.get('src'), - 'alt': element.get('alt'), - 'type': element.name, - 'description': find_closest_parent_with_useful_text(element) - }) - source_tags = element.find_all('source') - for source_tag in source_tags: - media[f"{element.name}s"].append({ - 'src': source_tag.get('src'), - 'alt': element.get('alt'), - 'type': element.name, - 'description': find_closest_parent_with_useful_text(element) - }) - return True # Always keep video and audio elements - - if element.name in ONLY_TEXT_ELIGIBLE_TAGS: - if kwargs.get('only_text', False): - element.replace_with(element.get_text()) - - try: - remove_unwanted_attributes(element, IMPORTANT_ATTRS, kwargs.get('keep_data_attributes', False)) - except Exception as e: - # print('Error removing unwanted attributes:', str(e)) - self._log('error', - message="Error removing unwanted attributes: {error}", - tag="SCRAPE", - params={"error": str(e)} - ) - # Process children - for child in list(element.children): - if isinstance(child, NavigableString) and not isinstance(child, Comment): - if len(child.strip()) > 0: - keep_element = True - else: - if process_element(child): - keep_element = True - - - # Check word count - if not keep_element: - word_count = len(element.get_text(strip=True).split()) - keep_element = word_count >= word_count_threshold - - if not keep_element: - element.decompose() - - return keep_element - except Exception as e: - # print('Error processing element:', str(e)) - self._log('error', - message="Error processing element: {error}", - tag="SCRAPE", - params={"error": str(e)} - ) - return False - - process_element(body) + links = {'internal': [], 'external': []} + media = result_obj['media'] + internal_links_dict = result_obj['internal_links_dict'] + 
external_links_dict = result_obj['external_links_dict'] # Update the links dictionary with unique links links['internal'] = list(internal_links_dict.values()) @@ -613,23 +554,14 @@ class WebScrapingStrategy(ContentScrapingStrategy): # # Process images using ThreadPoolExecutor imgs = body.find_all('img') - # For test we use for loop instead of thread media['images'] = [ - img for result in (process_image(img, url, i, len(imgs)) + img for result in (self.process_image(img, url, i, len(imgs)) for i, img in enumerate(imgs)) if result is not None for img in result ] - def flatten_nested_elements(node): - if isinstance(node, NavigableString): - return node - if len(node.contents) == 1 and isinstance(node.contents[0], element.Tag) and node.contents[0].name == node.name: - return flatten_nested_elements(node.contents[0]) - node.contents = [flatten_nested_elements(child) for child in node.contents] - return node - - body = flatten_nested_elements(body) + body = self.flatten_nested_elements(body) base64_pattern = re.compile(r'data:image/[^;]+;base64,([^"]+)') for img in imgs: src = img.get('src', '') diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index 253ec079..0a9e6f56 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -22,7 +22,7 @@ import textwrap from .html2text import HTML2Text class CustomHTML2Text(HTML2Text): - def __init__(self, *args, **kwargs): + def __init__(self, *args, handle_code_in_pre=False, **kwargs): super().__init__(*args, **kwargs) self.inside_pre = False self.inside_code = False @@ -30,6 +30,7 @@ class CustomHTML2Text(HTML2Text): self.current_preserved_tag = None self.preserved_content = [] self.preserve_depth = 0 + self.handle_code_in_pre = handle_code_in_pre # Configuration options self.skip_internal_links = False @@ -50,6 +51,8 @@ class CustomHTML2Text(HTML2Text): for key, value in kwargs.items(): if key == 'preserve_tags': self.preserve_tags = set(value) + elif key == 'handle_code_in_pre': + self.handle_code_in_pre = value else: 
setattr(self, key, value) @@ -88,13 +91,21 @@ class CustomHTML2Text(HTML2Text): # Handle pre tags if tag == 'pre': if start: - self.o('```\n') + self.o('```\n') # Markdown code block start self.inside_pre = True else: - self.o('\n```') + self.o('\n```\n') # Markdown code block end self.inside_pre = False - # elif tag in ["h1", "h2", "h3", "h4", "h5", "h6"]: - # pass + elif tag == 'code': + if self.inside_pre and not self.handle_code_in_pre: + # Ignore code tags inside pre blocks if handle_code_in_pre is False + return + if start: + self.o('`') # Markdown inline code start + self.inside_code = True + else: + self.o('`') # Markdown inline code end + self.inside_code = False else: super().handle_tag(tag, attrs, start) @@ -103,7 +114,39 @@ class CustomHTML2Text(HTML2Text): if self.preserve_depth > 0: self.preserved_content.append(data) return + + if self.inside_pre: + # Output the raw content for pre blocks, including content inside code tags + self.o(data) # Directly output the data as-is (preserve newlines) + return + if self.inside_code: + # Inline code: no newlines allowed + self.o(data.replace('\n', ' ')) + return + + # Default behavior for other tags super().handle_data(data, entity_char) + + + # # Handle pre tags + # if tag == 'pre': + # if start: + # self.o('```\n') + # self.inside_pre = True + # else: + # self.o('\n```') + # self.inside_pre = False + # # elif tag in ["h1", "h2", "h3", "h4", "h5", "h6"]: + # # pass + # else: + # super().handle_tag(tag, attrs, start) + + # def handle_data(self, data, entity_char=False): + # """Override handle_data to capture content within preserved tags.""" + # if self.preserve_depth > 0: + # self.preserved_content.append(data) + # return + # super().handle_data(data, entity_char) class InvalidCSSSelectorError(Exception): pass diff --git a/crawl4ai/utils.scraping.py b/crawl4ai/utils.scraping.py new file mode 100644 index 00000000..e69de29b diff --git a/docs/examples/quickstart_async.py b/docs/examples/quickstart_async.py index 
176b0ba7..9d97dabd 100644 --- a/docs/examples/quickstart_async.py +++ b/docs/examples/quickstart_async.py @@ -547,6 +547,7 @@ async def generate_knowledge_graph(): f.write(result.extracted_content) async def fit_markdown_remove_overlay(): + async with AsyncWebCrawler( headless=True, # Set to False to see what is happening verbose=True, @@ -560,13 +561,15 @@ async def fit_markdown_remove_overlay(): url='https://www.kidocode.com/degrees/technology', cache_mode=CacheMode.BYPASS, markdown_generator=DefaultMarkdownGenerator( - content_filter=PruningContentFilter(threshold=0.48, threshold_type="fixed", min_word_threshold=0), + content_filter=PruningContentFilter( + threshold=0.48, threshold_type="fixed", min_word_threshold=0 + ), options={ "ignore_links": True } ), # markdown_generator=DefaultMarkdownGenerator( - # content_filter=BM25ContentFilter(user_query=None, bm25_threshold=1.0), + # content_filter=BM25ContentFilter(user_query="", bm25_threshold=1.0), # options={ # "ignore_links": True # } From c51e901f68593c8a40788f23969873fccdfc6432 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sun, 8 Dec 2024 20:04:44 +0800 Subject: [PATCH 54/70] feat: Enhance AsyncPlaywrightCrawlerStrategy with text-only and light modes, dynamic viewport adjustment, and session management ### New Features: - **Text-Only Mode**: Added support for text-only crawling by disabling images, JavaScript, GPU, and other non-essential features. - **Light Mode**: Optimized browser settings to reduce resource usage and improve efficiency during crawling. - **Dynamic Viewport Adjustment**: Automatically adjusts viewport dimensions based on content size, ensuring accurate rendering and scaling. - **Full Page Scanning**: Introduced a feature to scroll and capture dynamic content for pages with infinite scroll or lazy-loading elements. - **Session Management**: Added `create_session` method for creating and managing browser sessions with unique IDs. 
### Improvements: - Unified viewport handling across contexts by dynamically setting dimensions using `self.viewport_width` and `self.viewport_height`. - Enhanced logging and error handling for viewport adjustments, page scanning, and content evaluation. - Reduced resource usage with additional browser flags for both `light_mode` and `text_only` configurations. - Improved handling of cookies, headers, and proxies in session creation. ### Refactoring: - Removed hardcoded viewport dimensions and replaced them with dynamic configurations. - Cleaned up unused and commented-out code for better readability and maintainability. - Introduced defaults for frequently used parameters like `delay_before_return_html`. ### Fixes: - Resolved potential inconsistencies in viewport handling. - Improved robustness of content loading and dynamic adjustments to avoid failures and timeouts. ### Docs Update: - Updated schema usage in `quickstart_async.py` example: - Changed `OpenAIModelFee.schema()` to `OpenAIModelFee.model_json_schema()` for compatibility. - Enhanced LLM extraction instruction documentation. This commit introduces significant enhancements to improve efficiency, flexibility, and reliability of the crawler strategy. 
--- CHANGELOG.md | 86 ++++++++++++ README.md | 23 ++-- crawl4ai/__version__.py | 2 +- crawl4ai/async_crawler_strategy.py | 214 ++++++++++++++++++++++++----- docs/examples/quickstart_async.py | 2 +- docs/md_v2/blog/index.md | 15 +- docs/md_v2/blog/releases/0.4.1.md | 145 +++++++++++++++++++ mkdocs.yml | 2 +- 8 files changed, 440 insertions(+), 49 deletions(-) create mode 100644 docs/md_v2/blog/releases/0.4.1.md diff --git a/CHANGELOG.md b/CHANGELOG.md index 03a7afb0..58dacf81 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,91 @@ # Changelog +## [0.4.1] December 8, 2024 + +### **File: `crawl4ai/async_crawler_strategy.py`** + +#### **New Parameters and Attributes Added** +- **`text_only` (boolean)**: Enables text-only mode, disables images, JavaScript, and GPU-related features for faster, minimal rendering. +- **`light_mode` (boolean)**: Optimizes the browser by disabling unnecessary background processes and features for efficiency. +- **`viewport_width` and `viewport_height`**: Dynamically adjusts based on `text_only` mode (default values: 800x600 for `text_only`, 1920x1080 otherwise). +- **`extra_args`**: Adds browser-specific flags for `text_only` mode. +- **`adjust_viewport_to_content`**: Dynamically adjusts the viewport to the content size for accurate rendering. + +#### **Browser Context Adjustments** +- Added **`viewport` adjustments**: Dynamically computed based on `text_only` or custom configuration. +- Enhanced support for `light_mode` and `text_only` by adding specific browser arguments to reduce resource consumption. + +#### **Dynamic Content Handling** +- **Full Page Scan Feature**: + - Scrolls through the entire page while dynamically detecting content changes. + - Ensures scrolling stops when no new dynamic content is loaded. + +#### **Session Management** +- Added **`create_session`** method: + - Creates a new browser session and assigns a unique ID. 
+ - Supports persistent and non-persistent contexts with full compatibility for cookies, headers, and proxies. + +#### **Improved Content Loading and Adjustment** +- **`adjust_viewport_to_content`**: + - Automatically adjusts viewport to match content dimensions. + - Includes scaling via Chrome DevTools Protocol (CDP). +- Enhanced content loading: + - Waits for images to load and ensures network activity is idle before proceeding. + +#### **Error Handling and Logging** +- Improved error handling and detailed logging for: + - Viewport adjustment (`adjust_viewport_to_content`). + - Full page scanning (`scan_full_page`). + - Dynamic content loading. + +#### **Refactoring and Cleanup** +- Removed hardcoded viewport dimensions in multiple places, replaced with dynamic values (`self.viewport_width`, `self.viewport_height`). +- Removed commented-out and unused code for better readability. +- Added default value for `delay_before_return_html` parameter. + +#### **Optimizations** +- Reduced resource usage in `light_mode` by disabling unnecessary browser features such as extensions, background timers, and sync. +- Improved compatibility for different browser types (`chrome`, `firefox`, `webkit`). + +--- + +### **File: `docs/examples/quickstart_async.py`** + +#### **Schema Adjustment** +- Changed schema reference for `LLMExtractionStrategy`: + - **Old**: `OpenAIModelFee.schema()` + - **New**: `OpenAIModelFee.model_json_schema()` + - This likely ensures better compatibility with the `OpenAIModelFee` class and its JSON schema. + +#### **Documentation Comments Updated** +- Improved extraction instruction for schema-based LLM strategies. + +--- + +### **New Features Added** +1. **Text-Only Mode**: + - Focuses on minimal resource usage by disabling non-essential browser features. +2. **Light Mode**: + - Optimizes browser for performance by disabling background tasks and unnecessary services. +3. 
**Full Page Scanning**: + - Ensures the entire content of a page is crawled, including dynamic elements loaded during scrolling. +4. **Dynamic Viewport Adjustment**: + - Automatically resizes the viewport to match content dimensions, improving compatibility and rendering accuracy. +5. **Session Management**: + - Simplifies session handling with better support for persistent and non-persistent contexts. + +--- + +### **Bug Fixes** +- Fixed potential viewport mismatches by ensuring consistent use of `self.viewport_width` and `self.viewport_height` throughout the code. +- Improved robustness of dynamic content loading to avoid timeouts and failed evaluations. + + + + + + + ## [0.3.75] December 1, 2024 ### PruningContentFilter diff --git a/README.md b/README.md index cbeb4067..dede4a03 100644 --- a/README.md +++ b/README.md @@ -11,10 +11,9 @@ Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant community. It delivers blazing-fast, AI-ready web crawling tailored for LLMs, AI agents, and data pipelines. Open source, flexible, and built for real-time performance, Crawl4AI empowers developers with unmatched speed, precision, and deployment ease. +[✨ Check out latest update v0.4.1](#-recent-updates) -🎉 **Version 0.4.0 is out!** Introducing our experimental PruningContentFilter - a powerful new algorithm for smarter Markdown generation. Test it out and [share your feedback](https://github.com/unclecode/crawl4ai/issues)! [Read the release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/md_v2/blog/releases/0.4.0.md) - -[✨ Check out latest update v0.4.0](#-recent-updates) +🎉 **Version 0.4.x is out!** Introducing our experimental PruningContentFilter - a powerful new algorithm for smarter Markdown generation. Test it out and [share your feedback](https://github.com/unclecode/crawl4ai/issues)! [Read the release notes →](https://crawl4ai.com/mkdocs/blog) ## 🧐 Why Crawl4AI? 
@@ -80,6 +79,7 @@ if __name__ == "__main__": - 🧩 **Proxy Support**: Seamlessly connect to proxies with authentication for secure access. - ⚙️ **Full Browser Control**: Modify headers, cookies, user agents, and more for tailored crawling setups. - 🌍 **Multi-Browser Support**: Compatible with Chromium, Firefox, and WebKit. +- 📐 **Dynamic Viewport Adjustment**: Automatically adjusts the browser viewport to match page content, ensuring complete rendering and capturing of all elements.
@@ -95,6 +95,8 @@ if __name__ == "__main__": - 💾 **Caching**: Cache data for improved speed and to avoid redundant fetches. - 📄 **Metadata Extraction**: Retrieve structured metadata from web pages. - 📡 **IFrame Content Extraction**: Seamless extraction from embedded iframe content. +- 🕵️ **Lazy Load Handling**: Waits for images to fully load, ensuring no content is missed due to lazy loading. +- 🔄 **Full-Page Scanning**: Simulates scrolling to load and capture all dynamic content, perfect for infinite scroll pages.
@@ -121,8 +123,6 @@ if __name__ == "__main__":
- - ## Try it Now! ✨ Play around with this [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1SgRPrByQLzjRfwoRNq1wSGE9nYY_EE8C?usp=sharing) @@ -626,13 +626,14 @@ async def test_news_crawl(): ## ✨ Recent Updates -- 🔬 **PruningContentFilter**: New unsupervised filtering strategy for intelligent content extraction based on text density and relevance scoring. -- 🧵 **Enhanced Thread Safety**: Improved multi-threaded environment handling with better locks and parallel processing support. -- 🤖 **Smart User-Agent Generation**: Advanced user-agent generator with customization options and randomization capabilities. -- 📝 **New Blog Launch**: Stay updated with our detailed release notes and technical deep dives at [crawl4ai.com/blog](https://crawl4ai.com/blog). -- 🧪 **Expanded Test Coverage**: Comprehensive test suite for both PruningContentFilter and BM25ContentFilter with edge case handling. +- 🖼️ **Lazy Load Handling**: Improved support for websites with lazy-loaded images. The crawler now waits for all images to fully load, ensuring no content is missed. +- ⚡ **Text-Only Mode**: New mode for fast, lightweight crawling. Disables images, JavaScript, and GPU rendering, improving speed by 3-4x for text-focused crawls. +- 📐 **Dynamic Viewport Adjustment**: Automatically adjusts the browser viewport to fit page content, ensuring accurate rendering and capturing of all elements. +- 🔄 **Full-Page Scanning**: Added scrolling support for pages with infinite scroll or dynamic content loading. Ensures every part of the page is captured. +- 🧑‍💻 **Session Reuse**: Introduced `create_session` for efficient crawling by reusing the same browser session across multiple requests. +- 🌟 **Light Mode**: Optimized browser performance by disabling unnecessary features like extensions, background timers, and sync processes. 
-Read the full details of this release in our [0.4.0 Release Notes](https://github.com/unclecode/crawl4ai/blob/main/docs/md_v2/blog/releases/0.4.0.md). +Read the full details of this release in our [0.4.1 Release Notes](https://github.com/unclecode/crawl4ai/blob/main/docs/md_v2/blog/releases/0.4.1.md). ## 📖 Documentation & Roadmap diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index 6f8b06f4..80861132 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,2 +1,2 @@ # crawl4ai/_version.py -__version__ = "0.4.0" +__version__ = "0.4.1" diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 493597ea..5c706239 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -220,8 +220,22 @@ class AsyncCrawlerStrategy(ABC): class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): def __init__(self, use_cached_html=False, js_code=None, logger = None, **kwargs): + self.text_only = kwargs.get("text_only", False) + self.light_mode = kwargs.get("light_mode", False) self.logger = logger self.use_cached_html = use_cached_html + self.viewport_width = kwargs.get("viewport_width", 800 if self.text_only else 1920) + self.viewport_height = kwargs.get("viewport_height", 600 if self.text_only else 1080) + + if self.text_only: + self.extra_args = kwargs.get("extra_args", []) + [ + '--disable-images', + '--disable-javascript', + '--disable-gpu', + '--disable-software-rasterizer', + '--disable-dev-shm-usage' + ] + self.user_agent = kwargs.get( "user_agent", # "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.5845.187 Safari/604.1 Edg/117.0.2045.47" @@ -300,7 +314,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): else: # If no default context exists, create one self.default_context = await self.browser.new_context( - viewport={"width": 1920, "height": 1080} + # viewport={"width": 1920, "height": 1080} + 
viewport={"width": self.viewport_width, "height": self.viewport_height} ) # Set up the default context @@ -334,10 +349,40 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): "--ignore-certificate-errors", "--ignore-certificate-errors-spki-list", "--disable-blink-features=AutomationControlled", - + "--window-position=400,0", + f"--window-size={self.viewport_width},{self.viewport_height}", ] } + if self.light_mode: + browser_args["args"].extend([ + # "--disable-background-networking", + "--disable-background-timer-throttling", + "--disable-backgrounding-occluded-windows", + "--disable-breakpad", + "--disable-client-side-phishing-detection", + "--disable-component-extensions-with-background-pages", + "--disable-default-apps", + "--disable-extensions", + "--disable-features=TranslateUI", + "--disable-hang-monitor", + "--disable-ipc-flooding-protection", + "--disable-popup-blocking", + "--disable-prompt-on-repost", + "--disable-sync", + "--force-color-profile=srgb", + "--metrics-recording-only", + "--no-first-run", + "--password-store=basic", + "--use-mock-keychain" + ]) + + if self.text_only: + browser_args["args"].extend([ + '--blink-settings=imagesEnabled=false', + '--disable-remote-fonts' + ]) + # Add channel if specified (try Chrome first) if self.chrome_channel: browser_args["channel"] = self.chrome_channel @@ -367,6 +412,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if self.browser_type == "firefox": self.browser = await self.playwright.firefox.launch(**browser_args) elif self.browser_type == "webkit": + if "viewport" not in browser_args: + browser_args["viewport"] = {"width": self.viewport_width, "height": self.viewport_height} self.browser = await self.playwright.webkit.launch(**browser_args) else: if self.use_persistent_context and self.user_data_dir: @@ -576,6 +623,38 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # Return the page object return page + async def create_session(self, **kwargs) -> str: + """Creates 
a new browser session and returns its ID.""" + if not self.browser: + await self.start() + + session_id = kwargs.get('session_id') or str(uuid.uuid4()) + + if self.use_managed_browser: + page = await self.default_context.new_page() + self.sessions[session_id] = (self.default_context, page, time.time()) + else: + if self.use_persistent_context and self.browser_type in ["chrome", "chromium"]: + context = self.browser + page = await context.new_page() + else: + context = await self.browser.new_context( + user_agent=kwargs.get("user_agent", self.user_agent), + viewport={"width": self.viewport_width, "height": self.viewport_height}, + proxy={"server": self.proxy} if self.proxy else None, + accept_downloads=self.accept_downloads, + ignore_https_errors=True + ) + + if self.cookies: + await context.add_cookies(self.cookies) + await context.set_extra_http_headers(self.headers) + page = await context.new_page() + + self.sessions[session_id] = (context, page, time.time()) + + return session_id + async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse: """ Crawls a given URL or processes raw HTML/local file content based on the URL prefix. 
@@ -684,12 +763,11 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if self.use_persistent_context and self.browser_type in ["chrome", "chromium"]: # In persistent context, browser is the context context = self.browser - page = await context.new_page() else: # Normal context creation for non-persistent or non-Chrome browsers context = await self.browser.new_context( user_agent=user_agent, - viewport={"width": 1200, "height": 800}, + viewport={"width": self.viewport_width, "height": self.viewport_height}, proxy={"server": self.proxy} if self.proxy else None, java_script_enabled=True, accept_downloads=self.accept_downloads, @@ -699,7 +777,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if self.cookies: await context.add_cookies(self.cookies) await context.set_extra_http_headers(self.headers) - page = await context.new_page() + + page = await context.new_page() self.sessions[session_id] = (context, page, time.time()) else: if self.use_persistent_context and self.browser_type in ["chrome", "chromium"]: @@ -709,7 +788,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # Normal context creation context = await self.browser.new_context( user_agent=user_agent, - viewport={"width": 1920, "height": 1080}, + # viewport={"width": 1920, "height": 1080}, + viewport={"width": self.viewport_width, "height": self.viewport_height}, proxy={"server": self.proxy} if self.proxy else None, accept_downloads=self.accept_downloads, ignore_https_errors=True # Add this line @@ -763,9 +843,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if self.accept_downloads: page.on("download", lambda download: asyncio.create_task(self._handle_download(download))) - # if self.verbose: - # print(f"[LOG] 🕸️ Crawling {url} using AsyncPlaywrightCrawlerStrategy...") - if self.use_cached_html: cache_file_path = os.path.join( os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest() @@ -786,7 +863,6 
@@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if not kwargs.get("js_only", False): await self.execute_hook('before_goto', page, context = context) - try: response = await page.goto( @@ -798,9 +874,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): except Error as e: raise RuntimeError(f"Failed on navigating ACS-GOTO :\n{str(e)}") - # response = await page.goto("about:blank") - # await page.evaluate(f"window.location.href = '{url}'") - await self.execute_hook('after_goto', page, context = context) # Get status code and headers @@ -853,7 +926,83 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): else: raise Error(f"Body element is hidden: {visibility_info}") - await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") + # CONTENT LOADING ASSURANCE + if not self.text_only and (kwargs.get("wait_for_images", True) or kwargs.get("adjust_viewport_to_content", False)): + # Wait for network idle after initial load and images to load + await page.wait_for_load_state("networkidle") + await asyncio.sleep(0.1) + await page.wait_for_function("Array.from(document.images).every(img => img.complete)") + + # After initial load, adjust viewport to content size + if not self.text_only and kwargs.get("adjust_viewport_to_content", False): + try: + # Get actual page dimensions + page_width = await page.evaluate("document.documentElement.scrollWidth") + page_height = await page.evaluate("document.documentElement.scrollHeight") + + target_width = self.viewport_width + target_height = int(target_width * page_width / page_height * 0.95) + await page.set_viewport_size({"width": target_width, "height": target_height}) + + # Compute scale factor + # We want the entire page visible: the scale should make both width and height fit + scale = min(target_width / page_width, target_height / page_height) + + # Now we call CDP to set metrics. 
+ # We tell Chrome that the "device" is page_width x page_height in size, + # but we scale it down so everything fits within the real viewport. + cdp = await page.context.new_cdp_session(page) + await cdp.send('Emulation.setDeviceMetricsOverride', { + 'width': page_width, # full page width + 'height': page_height, # full page height + 'deviceScaleFactor': 1, # keep normal DPR + 'mobile': False, + 'scale': scale # scale the entire rendered content + }) + + except Exception as e: + self.logger.warning( + message="Failed to adjust viewport to content: {error}", + tag="VIEWPORT", + params={"error": str(e)} + ) + + # After viewport adjustment, handle page scanning if requested + if kwargs.get("scan_full_page", False): + try: + viewport_height = page.viewport_size.get("height", self.viewport_height) + current_position = viewport_height # Start with one viewport height + scroll_delay = kwargs.get("scroll_delay", 0.2) + + # Initial scroll + await page.evaluate(f"window.scrollTo(0, {current_position})") + await asyncio.sleep(scroll_delay) + + # Get height after first scroll to account for any dynamic content + total_height = await page.evaluate("document.documentElement.scrollHeight") + + while current_position < total_height: + current_position = min(current_position + viewport_height, total_height) + await page.evaluate(f"window.scrollTo(0, {current_position})") + await asyncio.sleep(scroll_delay) + + # Check for dynamic content + new_height = await page.evaluate("document.documentElement.scrollHeight") + if new_height > total_height: + total_height = new_height + + # Scroll back to top + await page.evaluate("window.scrollTo(0, 0)") + + except Exception as e: + self.logger.warning( + message="Failed to perform full page scan: {error}", + tag="PAGE_SCAN", + params={"error": str(e)} + ) + else: + # Scroll to the bottom of the page + await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") js_code = kwargs.get("js_code", kwargs.get("js", self.js_code)) if 
js_code: @@ -887,7 +1036,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # await page.wait_for_load_state('networkidle', timeout=5000) # Update image dimensions - update_image_dimensions_js = """ + if not self.text_only: + update_image_dimensions_js = """ () => { return new Promise((resolve) => { const filterImage = (img) => { @@ -944,26 +1094,26 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): } """ - try: try: - await page.wait_for_load_state( - # state="load", - state="domcontentloaded", - timeout=5 + try: + await page.wait_for_load_state( + # state="load", + state="domcontentloaded", + timeout=5 + ) + except PlaywrightTimeoutError: + pass + await page.evaluate(update_image_dimensions_js) + except Exception as e: + self.logger.error( + message="Error updating image dimensions ACS-UPDATE_IMAGE_DIMENSIONS_JS: {error}", + tag="ERROR", + params={"error": str(e)} ) - except PlaywrightTimeoutError: - pass - await page.evaluate(update_image_dimensions_js) - except Exception as e: - self.logger.error( - message="Error updating image dimensions ACS-UPDATE_IMAGE_DIMENSIONS_JS: {error}", - tag="ERROR", - params={"error": str(e)} - ) - # raise RuntimeError(f"Error updating image dimensions ACS-UPDATE_IMAGE_DIMENSIONS_JS: {str(e)}") + # raise RuntimeError(f"Error updating image dimensions ACS-UPDATE_IMAGE_DIMENSIONS_JS: {str(e)}") # Wait a bit for any onload events to complete - await page.wait_for_timeout(100) + # await page.wait_for_timeout(100) # Process iframes if kwargs.get("process_iframes", False): @@ -971,7 +1121,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): await self.execute_hook('before_retrieve_html', page, context = context) # Check if delay_before_return_html is set then wait for that time - delay_before_return_html = kwargs.get("delay_before_return_html") + delay_before_return_html = kwargs.get("delay_before_return_html", 0.1) if delay_before_return_html: await asyncio.sleep(delay_before_return_html) diff --git 
a/docs/examples/quickstart_async.py b/docs/examples/quickstart_async.py index 9d97dabd..ac844ed5 100644 --- a/docs/examples/quickstart_async.py +++ b/docs/examples/quickstart_async.py @@ -128,7 +128,7 @@ async def extract_structured_data_using_llm(provider: str, api_token: str = None extraction_strategy=LLMExtractionStrategy( provider=provider, api_token=api_token, - schema=OpenAIModelFee.schema(), + schema=OpenAIModelFee.model_json_schema(), extraction_type="schema", instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens. Do not miss any models in the entire content. One extracted model JSON format should look like this: diff --git a/docs/md_v2/blog/index.md b/docs/md_v2/blog/index.md index 054b12f8..28ccfa6b 100644 --- a/docs/md_v2/blog/index.md +++ b/docs/md_v2/blog/index.md @@ -1,19 +1,28 @@ # Crawl4AI Blog -Welcome to the Crawl4AI blog! Here you'll find detailed release notes, technical deep dives, and news about the project. +Welcome to the Crawl4AI blog! Here you'll find detailed release notes, technical insights, and updates about the project. Whether you're looking for the latest improvements or want to dive deep into web crawling techniques, this is the place. ## Latest Release +### [0.4.1 - Smarter Crawling with Lazy-Load Handling, Text-Only Mode, and More](releases/0.4.1.md) +*December 8, 2024* + +This release brings major improvements to handling lazy-loaded images, a blazing-fast Text-Only Mode, full-page scanning for infinite scrolls, dynamic viewport adjustments, and session reuse for efficient crawling. If you're looking to improve speed, reliability, or handle dynamic content with ease, this update has you covered. + +[Read full release notes →](releases/0.4.1.md) + +--- + ### [0.4.0 - Major Content Filtering Update](releases/0.4.0.md) *December 1, 2024* -Introducing significant improvements to content filtering, multi-threaded environment handling, and user-agent generation. 
This release features the new PruningContentFilter, enhanced thread safety, and improved test coverage. +Introduced significant improvements to content filtering, multi-threaded environment handling, and user-agent generation. This release features the new PruningContentFilter, enhanced thread safety, and improved test coverage. [Read full release notes →](releases/0.4.0.md) ## Project History -Want to see how we got here? Check out our [complete changelog](https://github.com/unclecode/crawl4ai/blob/main/CHANGELOG.md) covering all previous versions and the evolution of Crawl4AI. +Curious about how Crawl4AI has evolved? Check out our [complete changelog](https://github.com/unclecode/crawl4ai/blob/main/CHANGELOG.md) for a detailed history of all versions and updates. ## Categories diff --git a/docs/md_v2/blog/releases/0.4.1.md b/docs/md_v2/blog/releases/0.4.1.md new file mode 100644 index 00000000..b02b758d --- /dev/null +++ b/docs/md_v2/blog/releases/0.4.1.md @@ -0,0 +1,145 @@ +# Release Summary for Version 0.4.1 (December 8, 2024): Major Efficiency Boosts with New Features! + +_This post was generated with the help of ChatGPT, so take everything with a grain of salt. 🧂_ + +Hi everyone, + +I just finished putting together version 0.4.1 of Crawl4AI, and there are a few changes in here that I think you’ll find really helpful. I’ll explain what’s new, why it matters, and exactly how you can use these features (with the code to back it up). Let’s get into it. + +--- + +### Handling Lazy Loading Better (Images Included) + +One thing that always bugged me with crawlers is how often they miss lazy-loaded content, especially images. In this version, I made sure Crawl4AI **waits for all images to load** before moving forward. This is useful because many modern websites only load images when they’re in the viewport or after some JavaScript executes.
+ +Here’s how to enable it: + +```python +await crawler.crawl( + url="https://example.com", + wait_for_images=True # Add this argument to ensure images are fully loaded +) +``` + +What this does is: +1. Waits for the page to reach a "network idle" state. +2. Ensures all images on the page have been completely loaded. + +This single change handles the majority of lazy-loading cases you’re likely to encounter. + +--- + +### Text-Only Mode (Fast, Lightweight Crawling) + +Sometimes, you don’t need to download images or process JavaScript at all. For example, if you’re crawling to extract text data, you can enable **text-only mode** to speed things up. By disabling images, JavaScript, and other heavy resources, this mode makes crawling **3-4 times faster** in most cases. + +Here’s how to turn it on: + +```python +crawler = AsyncPlaywrightCrawlerStrategy( + text_only=True # Set this to True to enable text-only crawling +) +``` + +When `text_only=True`, the crawler automatically: +- Disables GPU processing. +- Blocks image and JavaScript resources. +- Reduces the viewport size to 800x600 (you can override this with `viewport_width` and `viewport_height`). + +If you need to crawl thousands of pages where you only care about text, this mode will save you a ton of time and resources. + +--- + +### Adjusting the Viewport Dynamically + +Another useful addition is the ability to **dynamically adjust the viewport size** to match the content on the page. This is particularly helpful when you’re working with responsive layouts or want to ensure all parts of the page load properly. + +Here’s how it works: +1. The crawler calculates the page’s width and height after it loads. +2. It adjusts the viewport to fit the content dimensions. +3. (Optional) It uses Chrome DevTools Protocol (CDP) to simulate zooming out so everything fits in the viewport. 
+ +To enable this, use: + +```python +await crawler.crawl( + url="https://example.com", + adjust_viewport_to_content=True # Dynamically adjusts the viewport +) +``` + +This approach makes sure the entire page gets loaded into the viewport, especially for layouts that load content based on visibility. + +--- + +### Simulating Full-Page Scrolling + +Some websites load data dynamically as you scroll down the page. To handle these cases, I added support for **full-page scanning**. It simulates scrolling to the bottom of the page, checking for new content, and capturing it all. + +Here’s an example: + +```python +await crawler.crawl( + url="https://example.com", + scan_full_page=True, # Enables scrolling + scroll_delay=0.2 # Waits 200ms between scrolls (optional) +) +``` + +What happens here: +1. The crawler scrolls down in increments, waiting for content to load after each scroll. +2. It stops when no new content appears (i.e., dynamic elements stop loading). +3. It scrolls back to the top before finishing (if necessary). + +If you’ve ever had to deal with infinite scroll pages, this is going to save you a lot of headaches. + +--- + +### Reusing Browser Sessions (Save Time on Setup) + +By default, every time you crawl a page, a new browser context (or tab) is created. That’s fine for small crawls, but if you’re working on a large dataset, it’s more efficient to reuse the same session. + +I added a method called `create_session` for this: + +```python +session_id = await crawler.create_session() + +# Use the same session for multiple crawls +await crawler.crawl( + url="https://example.com/page1", + session_id=session_id # Reuse the session +) +await crawler.crawl( + url="https://example.com/page2", + session_id=session_id +) +``` + +This avoids creating a new tab for every page, speeding up the crawl and reducing memory usage. 
+ +--- + +### Other Updates + +Here are a few smaller updates I’ve made: +- **Light Mode**: Use `light_mode=True` to disable background processes, extensions, and other unnecessary features, making the browser more efficient. +- **Logging**: Improved logs to make debugging easier. +- **Defaults**: Added sensible defaults for things like `delay_before_return_html` (now set to 0.1 seconds). + +--- + +### How to Get the Update + +You can install or upgrade to version `0.4.1` like this: + +```bash +pip install crawl4ai --upgrade +``` + +As always, I’d love to hear your thoughts. If there’s something you think could be improved or if you have suggestions for future versions, let me know! + +Enjoy the new features, and happy crawling! 🕷️ + +--- + + diff --git a/mkdocs.yml b/mkdocs.yml index 4ba7c2a7..6009dddf 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -12,7 +12,7 @@ nav: - 'Quick Start': 'basic/quickstart.md' - Changelog & Blog: - 'Blog Home': 'blog/index.md' - - 'Latest (0.4.0)': 'blog/releases/0.4.0.md' + - 'Latest (0.4.1)': 'blog/releases/0.4.1.md' - 'Changelog': 'https://github.com/unclecode/crawl4ai/blob/main/CHANGELOG.md' - Basic: From e3488da1945e8c17ee3dc7e501be7187d7f6beae Mon Sep 17 00:00:00 2001 From: Olavo Henrique Marques Peixoto <98776769+olavohenrique03@users.noreply.github.com> Date: Mon, 9 Dec 2024 03:34:52 -0300 Subject: [PATCH 55/70] fixing Readmen tap (#313) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index dede4a03..7407484e 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# 🔥🕷️ Crawl4AI: Crawl Smarter, Faster, Freely. For AI. +# Crawl4AI: Crawl Smarter, Faster, Freely. For AI. unclecode%2Fcrawl4ai | Trendshift From ba3e8088027e67ee8956ff0c54f4ffcc0438ae87 Mon Sep 17 00:00:00 2001 From: lu4nx Date: Mon, 9 Dec 2024 17:19:26 +0800 Subject: [PATCH 56/70] fix: The extract method logs output only when self.verbose is set to True. 
(#314) Co-authored-by: lu4nx --- crawl4ai/extraction_strategy.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py index b79e0c43..b7eabf74 100644 --- a/crawl4ai/extraction_strategy.py +++ b/crawl4ai/extraction_strategy.py @@ -92,8 +92,10 @@ class LLMExtractionStrategy(ExtractionStrategy): def extract(self, url: str, ix:int, html: str) -> List[Dict[str, Any]]: - # print("[LOG] Extracting blocks from URL:", url) - print(f"[LOG] Call LLM for {url} - block index: {ix}") + if self.verbose: + # print("[LOG] Extracting blocks from URL:", url) + print(f"[LOG] Call LLM for {url} - block index: {ix}") + variable_values = { "URL": url, "HTML": escape_json_string(sanitize_html(html)), @@ -868,4 +870,4 @@ class JsonXPATHExtractionStrategy(ExtractionStrategy): def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]: combined_html = self.DEL.join(sections) - return self.extract(url, combined_html, **kwargs) \ No newline at end of file + return self.extract(url, combined_html, **kwargs) From 2d31915f0a0b8f1e5cecfaff0514423c20b6daeb Mon Sep 17 00:00:00 2001 From: UncleCode Date: Mon, 9 Dec 2024 20:04:59 +0800 Subject: [PATCH 57/70] Commit Message: Enhance Async Crawler with storage state handling - Updated Async Crawler to support storage state management. - Added error handling for URL validation in Async Web Crawler. - Modified README logo and improved .gitignore entries. - Fixed issues in multiple files for better code robustness. 
--- .gitignore | 5 ++- README.md | 2 +- crawl4ai/async_crawler_strategy.py | 54 ++++++++++++++++++++++++------ crawl4ai/async_webcrawler.py | 7 +++- crawl4ai/extraction_strategy.py | 2 +- crawl4ai/utils.py | 1 + main.py | 2 +- 7 files changed, 58 insertions(+), 15 deletions(-) diff --git a/.gitignore b/.gitignore index 52e25a2a..02c75b3f 100644 --- a/.gitignore +++ b/.gitignore @@ -214,4 +214,7 @@ git_issues.md todo_executor.md protect-all-except-feature.sh manage-collab.sh -publish.sh \ No newline at end of file +publish.sh + +combine.sh +combined_output.txt \ No newline at end of file diff --git a/README.md b/README.md index dede4a03..095c595c 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# 🔥🕷️ Crawl4AI: Crawl Smarter, Faster, Freely. For AI. +# 🚀🤖 Crawl4AI: Crawl Smarter, Faster, Freely. For AI. unclecode%2Fcrawl4ai | Trendshift diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 5c706239..fca0c0ec 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -238,8 +238,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): self.user_agent = kwargs.get( "user_agent", - # "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.5845.187 Safari/604.1 Edg/117.0.2045.47" - "Mozilla/5.0 (Linux; Android 11; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.120 Mobile Safari/537.36" + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.5845.187 Safari/604.1 Edg/117.0.2045.47" + # "Mozilla/5.0 (Linux; Android 11; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.120 Mobile Safari/537.36" ) user_agenr_generator = UserAgentGenerator() if kwargs.get("user_agent_mode") == "random": @@ -254,6 +254,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): self.browser_hint = user_agenr_generator.generate_client_hints(self.user_agent) 
self.headers.setdefault("sec-ch-ua", self.browser_hint) self.cookies = kwargs.get("cookies", []) + self.storage_state = kwargs.get("storage_state", None) self.sessions = {} self.session_ttl = 1800 self.js_code = js_code @@ -315,7 +316,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # If no default context exists, create one self.default_context = await self.browser.new_context( # viewport={"width": 1920, "height": 1080} - viewport={"width": self.viewport_width, "height": self.viewport_height} + viewport={"width": self.viewport_width, "height": self.viewport_height}, + storage_state=self.storage_state, ) # Set up the default context @@ -323,6 +325,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): await self.default_context.set_extra_http_headers(self.headers) if self.cookies: await self.default_context.add_cookies(self.cookies) + if self.storage_state: + # If storage_state is a dictionary or file path, Playwright will handle it. + await self.default_context.storage_state(path=None) # Just ensuring default_context is ready if self.accept_downloads: await self.default_context.set_default_timeout(60000) await self.default_context.set_default_navigation_timeout(60000) @@ -426,6 +431,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): self.default_context = self.browser else: self.browser = await self.playwright.chromium.launch(**browser_args) + self.default_context = self.browser except Exception as e: # Fallback to chromium if Chrome channel fails @@ -643,6 +649,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): viewport={"width": self.viewport_width, "height": self.viewport_height}, proxy={"server": self.proxy} if self.proxy else None, accept_downloads=self.accept_downloads, + storage_state=self.storage_state, ignore_https_errors=True ) @@ -771,6 +778,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): proxy={"server": self.proxy} if self.proxy else None, java_script_enabled=True, 
accept_downloads=self.accept_downloads, + storage_state=self.storage_state, # downloads_path=self.downloads_path if self.accept_downloads else None ) await context.add_cookies([{"name": "cookiesEnabled", "value": "true", "url": url}]) @@ -792,6 +800,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): viewport={"width": self.viewport_width, "height": self.viewport_height}, proxy={"server": self.proxy} if self.proxy else None, accept_downloads=self.accept_downloads, + storage_state=self.storage_state, ignore_https_errors=True # Add this line ) if self.cookies: @@ -862,7 +871,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): return response if not kwargs.get("js_only", False): - await self.execute_hook('before_goto', page, context = context) + await self.execute_hook('before_goto', page, context = context, **kwargs) try: response = await page.goto( @@ -874,7 +883,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): except Error as e: raise RuntimeError(f"Failed on navigating ACS-GOTO :\n{str(e)}") - await self.execute_hook('after_goto', page, context = context) + await self.execute_hook('after_goto', page, context = context, **kwargs) # Get status code and headers status_code = response.status @@ -929,9 +938,15 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # CONTENT LOADING ASSURANCE if not self.text_only and (kwargs.get("wait_for_images", True) or kwargs.get("adjust_viewport_to_content", False)): # Wait for network idle after initial load and images to load - await page.wait_for_load_state("networkidle") + # await page.wait_for_load_state("networkidle") + await page.wait_for_load_state("domcontentloaded") await asyncio.sleep(0.1) - await page.wait_for_function("Array.from(document.images).every(img => img.complete)") + from playwright.async_api import TimeoutError as PlaywrightTimeoutError + try: + await page.wait_for_function("Array.from(document.images).every(img => img.complete)", timeout=1000) + # Check for 
TimeoutError and ignore it + except PlaywrightTimeoutError: + pass # After initial load, adjust viewport to content size if not self.text_only and kwargs.get("adjust_viewport_to_content", False): @@ -1015,7 +1030,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # await page.wait_for_timeout(100) # Check for on execution event - await self.execute_hook('on_execution_started', page, context = context) + await self.execute_hook('on_execution_started', page, context = context, **kwargs) if kwargs.get("simulate_user", False) or kwargs.get("magic", False): # Simulate user interactions @@ -1119,7 +1134,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if kwargs.get("process_iframes", False): page = await self.process_iframes(page) - await self.execute_hook('before_retrieve_html', page, context = context) + await self.execute_hook('before_retrieve_html', page, context = context, **kwargs) # Check if delay_before_return_html is set then wait for that time delay_before_return_html = kwargs.get("delay_before_return_html", 0.1) if delay_before_return_html: @@ -1130,7 +1145,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): await self.remove_overlay_elements(page) html = await page.content() - await self.execute_hook('before_return_html', page, html, context = context) + await self.execute_hook('before_return_html', page, html, context = context, **kwargs) # Check if kwargs has screenshot=True then take screenshot screenshot_data = None @@ -1394,6 +1409,25 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): return base64.b64encode(buffered.getvalue()).decode('utf-8') finally: await page.close() + + async def export_storage_state(self, path: str = None) -> dict: + """ + Exports the current storage state (cookies, localStorage, sessionStorage) + to a JSON file at the specified path. 
+ """ + if self.default_context: + state = await self.default_context.storage_state(path=path) + self.logger.info( + message="Exported storage state to {path}", + tag="INFO", + params={"path": path} + ) + return state + else: + self.logger.warning( + message="No default_context available to export storage state.", + tag="WARNING" + ) async def _generate_screenshot_from_html(self, html: str) -> Optional[str]: """ diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 2c17602d..b872c20c 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -182,6 +182,10 @@ class AsyncWebCrawler: Returns: CrawlResult: The result of crawling and processing """ + # Check if url is not string and is not empty + if not isinstance(url, str) or not url: + raise ValueError("Invalid URL, make sure the URL is a non-empty string") + async with self._lock or nullcontext(): try: # Handle deprecated parameters @@ -335,7 +339,8 @@ class AsyncWebCrawler: # print(f"{Fore.RED}{self.tag_format('ERROR')} {self.log_icons['ERROR']} Failed to crawl {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... 
| {e.msg}{Style.RESET_ALL}") self.logger.error_status( - url=cache_context.display_url, + # url=cache_context.display_url, + url=url, error=create_box_message(e.msg, type = "error"), tag="ERROR" ) diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py index b79e0c43..a778bf4d 100644 --- a/crawl4ai/extraction_strategy.py +++ b/crawl4ai/extraction_strategy.py @@ -632,7 +632,7 @@ class ContentSummarizationStrategy(ExtractionStrategy): # Sort summaries by the original section index to maintain order summaries.sort(key=lambda x: x[0]) return [summary for _, summary in summaries] - + class JsonCssExtractionStrategy(ExtractionStrategy): def __init__(self, schema: Dict[str, Any], **kwargs): super().__init__(**kwargs) diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index 0a9e6f56..879ba562 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -147,6 +147,7 @@ class CustomHTML2Text(HTML2Text): # self.preserved_content.append(data) # return # super().handle_data(data, entity_char) + class InvalidCSSSelectorError(Exception): pass diff --git a/main.py b/main.py index d6c792e8..21d3de16 100644 --- a/main.py +++ b/main.py @@ -342,7 +342,7 @@ app.add_middleware( # API token security security = HTTPBearer() -CRAWL4AI_API_TOKEN = os.getenv("CRAWL4AI_API_TOKEN") or "test_api_code" +CRAWL4AI_API_TOKEN = os.getenv("CRAWL4AI_API_TOKEN") async def verify_token(credentials: HTTPAuthorizationCredentials = Security(security)): if not CRAWL4AI_API_TOKEN: From ded554d3345ca00c038274fc38ff43b28b45cdd8 Mon Sep 17 00:00:00 2001 From: Mohammed Date: Mon, 9 Dec 2024 07:17:43 -0500 Subject: [PATCH 58/70] Fixed typo (#324) --- docs/md_v2/basic/quickstart.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/md_v2/basic/quickstart.md b/docs/md_v2/basic/quickstart.md index 95b8a397..c18cd7d1 100644 --- a/docs/md_v2/basic/quickstart.md +++ b/docs/md_v2/basic/quickstart.md @@ -8,7 +8,7 @@ First, let's import the necessary modules and create an instance 
of `AsyncWebCra ```python import asyncio -from crawl4ai import AsyncWebCrawler, CasheMode +from crawl4ai import AsyncWebCrawler, CacheMode async def main(): async with AsyncWebCrawler(verbose=True) as crawler: From e130fd8db9bbb9323e800efb6875775b468c421c Mon Sep 17 00:00:00 2001 From: UncleCode Date: Tue, 10 Dec 2024 17:55:29 +0800 Subject: [PATCH 59/70] Implement new async crawler features and stability updates - Introduced new async crawl strategy with session management. - Added BrowserManager for improved browser management. - Enhanced documentation, focusing on storage state and usage examples. - Improved error handling and logging for sessions. - Added JavaScript snippets for customizing navigator properties. --- crawl4ai/async_crawler_strategy.current.py | 1475 +++++++++++++++++ crawl4ai/async_crawler_strategy.py | 771 ++++----- crawl4ai/async_tools.py | 183 ++ crawl4ai/async_webcrawler.py | 13 +- crawl4ai/content_scraping_strategy.py | 72 +- crawl4ai/html2text/__init__.py | 128 +- crawl4ai/js_snippet/__init__.py | 15 + crawl4ai/js_snippet/navigator_overrider.js | 25 + .../js_snippet/remove_overlay_elements.js | 119 ++ .../js_snippet/update_image_dimensions.js | 54 + crawl4ai/markdown_generation_strategy.py | 23 +- crawl4ai/tools.py | 34 - crawl4ai/utils.py | 207 +-- docs/examples/storage_state_tutorial.md | 225 +++ docs/md_v2/basic/quickstart.md | 2 +- tests/async/test_0.4.2_browser_manager.py | 153 ++ 16 files changed, 2750 insertions(+), 749 deletions(-) create mode 100644 crawl4ai/async_crawler_strategy.current.py create mode 100644 crawl4ai/async_tools.py create mode 100644 crawl4ai/js_snippet/__init__.py create mode 100644 crawl4ai/js_snippet/navigator_overrider.js create mode 100644 crawl4ai/js_snippet/remove_overlay_elements.js create mode 100644 crawl4ai/js_snippet/update_image_dimensions.js delete mode 100644 crawl4ai/tools.py create mode 100644 docs/examples/storage_state_tutorial.md create mode 100644 tests/async/test_0.4.2_browser_manager.py 
diff --git a/crawl4ai/async_crawler_strategy.current.py b/crawl4ai/async_crawler_strategy.current.py new file mode 100644 index 00000000..6302447c --- /dev/null +++ b/crawl4ai/async_crawler_strategy.current.py @@ -0,0 +1,1475 @@ +import asyncio +import base64 +import time +from abc import ABC, abstractmethod +from typing import Callable, Dict, Any, List, Optional, Awaitable +import os, sys, shutil +import tempfile, subprocess +from playwright.async_api import async_playwright, Page, Browser, Error +from playwright.async_api import TimeoutError as PlaywrightTimeoutError +from io import BytesIO +from PIL import Image, ImageDraw, ImageFont +from pathlib import Path +from playwright.async_api import ProxySettings +from pydantic import BaseModel +import hashlib +import json +import uuid +from .models import AsyncCrawlResponse +from .utils import create_box_message +from .user_agent_generator import UserAgentGenerator +from playwright_stealth import StealthConfig, stealth_async + +stealth_config = StealthConfig( + webdriver=True, + chrome_app=True, + chrome_csi=True, + chrome_load_times=True, + chrome_runtime=True, + navigator_languages=True, + navigator_plugins=True, + navigator_permissions=True, + webgl_vendor=True, + outerdimensions=True, + navigator_hardware_concurrency=True, + media_codecs=True, +) + +BROWSER_DISABLE_OPTIONS = [ + "--disable-background-networking", + "--disable-background-timer-throttling", + "--disable-backgrounding-occluded-windows", + "--disable-breakpad", + "--disable-client-side-phishing-detection", + "--disable-component-extensions-with-background-pages", + "--disable-default-apps", + "--disable-extensions", + "--disable-features=TranslateUI", + "--disable-hang-monitor", + "--disable-ipc-flooding-protection", + "--disable-popup-blocking", + "--disable-prompt-on-repost", + "--disable-sync", + "--force-color-profile=srgb", + "--metrics-recording-only", + "--no-first-run", + "--password-store=basic", + "--use-mock-keychain" +] + + +class 
ManagedBrowser: + def __init__(self, browser_type: str = "chromium", user_data_dir: Optional[str] = None, headless: bool = False, logger = None, host: str = "localhost", debugging_port: int = 9222): + self.browser_type = browser_type + self.user_data_dir = user_data_dir + self.headless = headless + self.browser_process = None + self.temp_dir = None + self.debugging_port = debugging_port + self.host = host + self.logger = logger + self.shutting_down = False + + async def start(self) -> str: + """ + Starts the browser process and returns the CDP endpoint URL. + If user_data_dir is not provided, creates a temporary directory. + """ + + # Create temp dir if needed + if not self.user_data_dir: + self.temp_dir = tempfile.mkdtemp(prefix="browser-profile-") + self.user_data_dir = self.temp_dir + + # Get browser path and args based on OS and browser type + browser_path = self._get_browser_path() + args = self._get_browser_args() + + # Start browser process + try: + self.browser_process = subprocess.Popen( + args, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE + ) + # Monitor browser process output for errors + asyncio.create_task(self._monitor_browser_process()) + await asyncio.sleep(2) # Give browser time to start + return f"http://{self.host}:{self.debugging_port}" + except Exception as e: + await self.cleanup() + raise Exception(f"Failed to start browser: {e}") + + async def _monitor_browser_process(self): + """Monitor the browser process for unexpected termination.""" + if self.browser_process: + try: + stdout, stderr = await asyncio.gather( + asyncio.to_thread(self.browser_process.stdout.read), + asyncio.to_thread(self.browser_process.stderr.read) + ) + + # Check shutting_down flag BEFORE logging anything + if self.browser_process.poll() is not None: + if not self.shutting_down: + self.logger.error( + message="Browser process terminated unexpectedly | Code: {code} | STDOUT: {stdout} | STDERR: {stderr}", + tag="ERROR", + params={ + "code": 
self.browser_process.returncode, + "stdout": stdout.decode(), + "stderr": stderr.decode() + } + ) + await self.cleanup() + else: + self.logger.info( + message="Browser process terminated normally | Code: {code}", + tag="INFO", + params={"code": self.browser_process.returncode} + ) + except Exception as e: + if not self.shutting_down: + self.logger.error( + message="Error monitoring browser process: {error}", + tag="ERROR", + params={"error": str(e)} + ) + + def _get_browser_path(self) -> str: + """Returns the browser executable path based on OS and browser type""" + if sys.platform == "darwin": # macOS + paths = { + "chromium": "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome", + "firefox": "/Applications/Firefox.app/Contents/MacOS/firefox", + "webkit": "/Applications/Safari.app/Contents/MacOS/Safari" + } + elif sys.platform == "win32": # Windows + paths = { + "chromium": "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe", + "firefox": "C:\\Program Files\\Mozilla Firefox\\firefox.exe", + "webkit": None # WebKit not supported on Windows + } + else: # Linux + paths = { + "chromium": "google-chrome", + "firefox": "firefox", + "webkit": None # WebKit not supported on Linux + } + + return paths.get(self.browser_type) + + def _get_browser_args(self) -> List[str]: + """Returns browser-specific command line arguments""" + base_args = [self._get_browser_path()] + + if self.browser_type == "chromium": + args = [ + f"--remote-debugging-port={self.debugging_port}", + f"--user-data-dir={self.user_data_dir}", + ] + if self.headless: + args.append("--headless=new") + elif self.browser_type == "firefox": + args = [ + "--remote-debugging-port", str(self.debugging_port), + "--profile", self.user_data_dir, + ] + if self.headless: + args.append("--headless") + else: + raise NotImplementedError(f"Browser type {self.browser_type} not supported") + + return base_args + args + + async def cleanup(self): + """Cleanup browser process and temporary directory""" + # Set 
shutting_down flag BEFORE any termination actions + self.shutting_down = True + + if self.browser_process: + try: + self.browser_process.terminate() + # Wait for process to end gracefully + for _ in range(10): # 10 attempts, 100ms each + if self.browser_process.poll() is not None: + break + await asyncio.sleep(0.1) + + # Force kill if still running + if self.browser_process.poll() is None: + self.browser_process.kill() + await asyncio.sleep(0.1) # Brief wait for kill to take effect + + except Exception as e: + self.logger.error( + message="Error terminating browser: {error}", + tag="ERROR", + params={"error": str(e)} + ) + + if self.temp_dir and os.path.exists(self.temp_dir): + try: + shutil.rmtree(self.temp_dir) + except Exception as e: + self.logger.error( + message="Error removing temporary directory: {error}", + tag="ERROR", + params={"error": str(e)} + ) + + +class AsyncCrawlerStrategy(ABC): + @abstractmethod + async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse: + pass + + @abstractmethod + async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]: + pass + + @abstractmethod + async def take_screenshot(self, **kwargs) -> str: + pass + + @abstractmethod + def update_user_agent(self, user_agent: str): + pass + + @abstractmethod + def set_hook(self, hook_type: str, hook: Callable): + pass + +class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): + def __init__(self, use_cached_html=False, js_code=None, logger = None, **kwargs): + self.text_only = kwargs.get("text_only", False) + self.light_mode = kwargs.get("light_mode", False) + self.logger = logger + self.use_cached_html = use_cached_html + self.viewport_width = kwargs.get("viewport_width", 800 if self.text_only else 1920) + self.viewport_height = kwargs.get("viewport_height", 600 if self.text_only else 1080) + + if self.text_only: + self.extra_args = kwargs.get("extra_args", []) + [ + '--disable-images', + '--disable-javascript', + '--disable-gpu', + 
'--disable-software-rasterizer', + '--disable-dev-shm-usage' + ] + + self.user_agent = kwargs.get( + "user_agent", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.5845.187 Safari/604.1 Edg/117.0.2045.47" + # "Mozilla/5.0 (Linux; Android 11; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.120 Mobile Safari/537.36" + ) + user_agenr_generator = UserAgentGenerator() + if kwargs.get("user_agent_mode") == "random": + self.user_agent = user_agenr_generator.generate( + **kwargs.get("user_agent_generator_config", {}) + ) + self.proxy = kwargs.get("proxy") + self.proxy_config = kwargs.get("proxy_config") + self.headless = kwargs.get("headless", True) + self.browser_type = kwargs.get("browser_type", "chromium") + self.headers = kwargs.get("headers", {}) + self.browser_hint = user_agenr_generator.generate_client_hints(self.user_agent) + self.headers.setdefault("sec-ch-ua", self.browser_hint) + self.cookies = kwargs.get("cookies", []) + self.storage_state = kwargs.get("storage_state", None) + self.sessions = {} + self.session_ttl = 1800 + self.js_code = js_code + self.verbose = kwargs.get("verbose", False) + self.playwright = None + self.browser = None + self.sleep_on_close = kwargs.get("sleep_on_close", False) + self.use_managed_browser = kwargs.get("use_managed_browser", False) + self.user_data_dir = kwargs.get("user_data_dir", None) + self.use_persistent_context = kwargs.get("use_persistent_context", False) + self.chrome_channel = kwargs.get("chrome_channel", "chrome") + self.managed_browser = None + self.default_context = None + self.hooks = { + 'on_browser_created': None, + 'on_user_agent_updated': None, + 'on_execution_started': None, + 'before_goto': None, + 'after_goto': None, + 'before_return_html': None, + 'before_retrieve_html': None + } + self.extra_args = kwargs.get("extra_args", []) + self.ignore_https_errors = kwargs.get("ignore_https_errors", True) + self.java_script_enabled = 
kwargs.get("java_script_enabled", True) + self.accept_downloads = kwargs.get("accept_downloads", False) + self.downloads_path = kwargs.get("downloads_path") + self._downloaded_files = [] # Track downloaded files for current crawl + if self.accept_downloads and not self.downloads_path: + self.downloads_path = os.path.join(os.getcwd(), "downloads") + os.makedirs(self.downloads_path, exist_ok=True) + + + async def __aenter__(self): + await self.start() + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb): + await self.close() + + async def start(self): + if self.playwright is None: + self.playwright = await async_playwright().start() + if self.browser is None: + if self.use_managed_browser: + # Use managed browser approach + self.managed_browser = ManagedBrowser( + browser_type=self.browser_type, + user_data_dir=self.user_data_dir, + headless=self.headless, + logger=self.logger + ) + cdp_url = await self.managed_browser.start() + self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url) + + # Get the default context that maintains the user profile + contexts = self.browser.contexts + if contexts: + self.default_context = contexts[0] + else: + # If no default context exists, create one + self.default_context = await self.browser.new_context( + viewport={"width": self.viewport_width, "height": self.viewport_height}, + storage_state=self.storage_state, + user_agent= self.user_agent, + accept_downloads=self.accept_downloads, + ignore_https_errors=self.ignore_https_errors, + java_script_enabled=self.java_script_enabled, + ) + + # Set up the default context + if self.default_context: + await self.default_context.set_extra_http_headers(self.headers) + if self.cookies: + await self.default_context.add_cookies(self.cookies) + if self.storage_state: + # If storage_state is a dictionary or file path, Playwright will handle it. 
+ await self.default_context.storage_state(path=None) # Just ensuring default_context is ready + if self.accept_downloads: + await self.default_context.set_default_timeout(60000) + await self.default_context.set_default_navigation_timeout(60000) + self.default_context._impl_obj._options["accept_downloads"] = True + self.default_context._impl_obj._options["downloads_path"] = self.downloads_path + + if self.user_agent: + await self.default_context.set_extra_http_headers({ + "User-Agent": self.user_agent, + "sec-ch-ua": self.browser_hint, + # **self.headers + }) + else: + # Base browser arguments + browser_args = { + "headless": self.headless, + "args": [ + "--no-sandbox", + "--disable-dev-shm-usage", + "--no-first-run", + "--no-default-browser-check", + "--disable-infobars", + "--window-position=0,0", + "--ignore-certificate-errors", + "--ignore-certificate-errors-spki-list", + "--disable-blink-features=AutomationControlled", + "--window-position=400,0", + f"--window-size={self.viewport_width},{self.viewport_height}", + ] + } + + if self.light_mode: + browser_args["args"].extend(BROWSER_DISABLE_OPTIONS) + + if self.text_only: + browser_args["args"].extend([ + '--blink-settings=imagesEnabled=false', + '--disable-remote-fonts' + ]) + + # Add channel if specified (try Chrome first) + if self.chrome_channel: + browser_args["channel"] = self.chrome_channel + + # Add extra args if provided + if self.extra_args: + browser_args["args"].extend(self.extra_args) + + # Add downloads path if downloads are enabled + if self.accept_downloads: + browser_args["downloads_path"] = self.downloads_path + + # Add proxy settings if a proxy is specified + if self.proxy: + proxy_settings = ProxySettings(server=self.proxy) + browser_args["proxy"] = proxy_settings + elif self.proxy_config: + proxy_settings = ProxySettings( + server=self.proxy_config.get("server"), + username=self.proxy_config.get("username"), + password=self.proxy_config.get("password") + ) + browser_args["proxy"] = 
proxy_settings + + try: + # Select the appropriate browser based on the browser_type + if self.browser_type == "firefox": + self.browser = await self.playwright.firefox.launch(**browser_args) + elif self.browser_type == "webkit": + if "viewport" not in browser_args: + browser_args["viewport"] = {"width": self.viewport_width, "height": self.viewport_height} + self.browser = await self.playwright.webkit.launch(**browser_args) + else: + if self.use_persistent_context and self.user_data_dir: + self.browser = await self.playwright.chromium.launch_persistent_context( + user_data_dir=self.user_data_dir, + accept_downloads=self.accept_downloads, + downloads_path=self.downloads_path if self.accept_downloads else None, + **browser_args + ) + self.default_context = self.browser + else: + self.browser = await self.playwright.chromium.launch(**browser_args) + self.default_context = self.browser + + except Exception as e: + # Fallback to chromium if Chrome channel fails + if "chrome" in str(e) and browser_args.get("channel") == "chrome": + browser_args["channel"] = "chromium" + if self.use_persistent_context and self.user_data_dir: + self.browser = await self.playwright.chromium.launch_persistent_context( + user_data_dir=self.user_data_dir, + **browser_args + ) + self.default_context = self.browser + else: + self.browser = await self.playwright.chromium.launch(**browser_args) + else: + raise + + await self.execute_hook('on_browser_created', self.browser) + + async def close(self): + if self.sleep_on_close: + await asyncio.sleep(0.5) + + # Close all active sessions + session_ids = list(self.sessions.keys()) + for session_id in session_ids: + await self.kill_session(session_id) + + if self.browser: + await self.browser.close() + self.browser = None + + if self.managed_browser: + await asyncio.sleep(0.5) + await self.managed_browser.cleanup() + self.managed_browser = None + + if self.playwright: + await self.playwright.stop() + self.playwright = None + + # Issue #256: Remove 
__del__ method to avoid potential issues with async cleanup + # def __del__(self): + # if self.browser or self.playwright: + # asyncio.get_event_loop().run_until_complete(self.close()) + + def set_hook(self, hook_type: str, hook: Callable): + if hook_type in self.hooks: + self.hooks[hook_type] = hook + else: + raise ValueError(f"Invalid hook type: {hook_type}") + + async def execute_hook(self, hook_type: str, *args, **kwargs): + hook = self.hooks.get(hook_type) + if hook: + if asyncio.iscoroutinefunction(hook): + return await hook(*args, **kwargs) + else: + return hook(*args, **kwargs) + return args[0] if args else None + + def update_user_agent(self, user_agent: str): + self.user_agent = user_agent + + def set_custom_headers(self, headers: Dict[str, str]): + self.headers = headers + + async def kill_session(self, session_id: str): + if session_id in self.sessions: + context, page, _ = self.sessions[session_id] + await page.close() + if not self.use_managed_browser: + await context.close() + del self.sessions[session_id] + + def _cleanup_expired_sessions(self): + current_time = time.time() + expired_sessions = [ + sid for sid, (_, _, last_used) in self.sessions.items() + if current_time - last_used > self.session_ttl + ] + for sid in expired_sessions: + asyncio.create_task(self.kill_session(sid)) + + async def smart_wait(self, page: Page, wait_for: str, timeout: float = 30000): + wait_for = wait_for.strip() + + if wait_for.startswith('js:'): + # Explicitly specified JavaScript + js_code = wait_for[3:].strip() + return await self.csp_compliant_wait(page, js_code, timeout) + elif wait_for.startswith('css:'): + # Explicitly specified CSS selector + css_selector = wait_for[4:].strip() + try: + await page.wait_for_selector(css_selector, timeout=timeout) + except Error as e: + if 'Timeout' in str(e): + raise TimeoutError(f"Timeout after {timeout}ms waiting for selector '{css_selector}'") + else: + raise ValueError(f"Invalid CSS selector: '{css_selector}'") + else: + # 
Auto-detect based on content + if wait_for.startswith('()') or wait_for.startswith('function'): + # It's likely a JavaScript function + return await self.csp_compliant_wait(page, wait_for, timeout) + else: + # Assume it's a CSS selector first + try: + await page.wait_for_selector(wait_for, timeout=timeout) + except Error as e: + if 'Timeout' in str(e): + raise TimeoutError(f"Timeout after {timeout}ms waiting for selector '{wait_for}'") + else: + # If it's not a timeout error, it might be an invalid selector + # Let's try to evaluate it as a JavaScript function as a fallback + try: + return await self.csp_compliant_wait(page, f"() => {{{wait_for}}}", timeout) + except Error: + raise ValueError(f"Invalid wait_for parameter: '{wait_for}'. " + "It should be either a valid CSS selector, a JavaScript function, " + "or explicitly prefixed with 'js:' or 'css:'.") + + async def csp_compliant_wait(self, page: Page, user_wait_function: str, timeout: float = 30000): + wrapper_js = f""" + async () => {{ + const userFunction = {user_wait_function}; + const startTime = Date.now(); + while (true) {{ + if (await userFunction()) {{ + return true; + }} + if (Date.now() - startTime > {timeout}) {{ + throw new Error('Timeout waiting for condition'); + }} + await new Promise(resolve => setTimeout(resolve, 100)); + }} + }} + """ + + try: + await page.evaluate(wrapper_js) + except TimeoutError: + raise TimeoutError(f"Timeout after {timeout}ms waiting for condition") + except Exception as e: + raise RuntimeError(f"Error in wait condition: {str(e)}") + + async def process_iframes(self, page): + # Find all iframes + iframes = await page.query_selector_all('iframe') + + for i, iframe in enumerate(iframes): + try: + # Add a unique identifier to the iframe + await iframe.evaluate(f'(element) => element.id = "iframe-{i}"') + + # Get the frame associated with this iframe + frame = await iframe.content_frame() + + if frame: + # Wait for the frame to load + await frame.wait_for_load_state('load', 
timeout=30000) # 30 seconds timeout + + # Extract the content of the iframe's body + iframe_content = await frame.evaluate('() => document.body.innerHTML') + + # Generate a unique class name for this iframe + class_name = f'extracted-iframe-content-{i}' + + # Replace the iframe with a div containing the extracted content + _iframe = iframe_content.replace('`', '\\`') + await page.evaluate(f""" + () => {{ + const iframe = document.getElementById('iframe-{i}'); + const div = document.createElement('div'); + div.innerHTML = `{_iframe}`; + div.className = '{class_name}'; + iframe.replaceWith(div); + }} + """) + else: + # print(f"Warning: Could not access content frame for iframe {i}") + self.logger.warning( + message="Could not access content frame for iframe {index}", + tag="SCRAPE", + params={"index": i} + ) + except Exception as e: + self.logger.error( + message="Error processing iframe {index}: {error}", + tag="ERROR", + params={"index": i, "error": str(e)} + ) + # print(f"Error processing iframe {i}: {str(e)}") + + # Return the page object + return page + + async def create_session(self, **kwargs) -> str: + """Creates a new browser session and returns its ID.""" + if not self.browser: + await self.start() + + session_id = kwargs.get('session_id') or str(uuid.uuid4()) + + if self.use_managed_browser: + page = await self.default_context.new_page() + self.sessions[session_id] = (self.default_context, page, time.time()) + else: + if self.use_persistent_context and self.browser_type in ["chrome", "chromium"]: + context = self.browser + page = await context.new_page() + else: + context = await self.browser.new_context( + user_agent=kwargs.get("user_agent", self.user_agent), + viewport={"width": self.viewport_width, "height": self.viewport_height}, + proxy={"server": self.proxy} if self.proxy else None, + accept_downloads=self.accept_downloads, + storage_state=self.storage_state, + ignore_https_errors=True + ) + + if self.cookies: + await 
context.add_cookies(self.cookies) + await context.set_extra_http_headers(self.headers) + page = await context.new_page() + + self.sessions[session_id] = (context, page, time.time()) + + return session_id + + async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse: + """ + Crawls a given URL or processes raw HTML/local file content based on the URL prefix. + + Args: + url (str): The URL to crawl. Supported prefixes: + - 'http://' or 'https://': Web URL to crawl. + - 'file://': Local file path to process. + - 'raw:': Raw HTML content to process. + **kwargs: Additional parameters: + - 'screenshot' (bool): Whether to take a screenshot. + - ... [other existing parameters] + + Returns: + AsyncCrawlResponse: The response containing HTML, headers, status code, and optional screenshot. + """ + response_headers = {} + status_code = 200 # Default to 200 for local/raw HTML + screenshot_requested = kwargs.get('screenshot', False) + screenshot_data = None + + if url.startswith(('http://', 'https://')): + # Proceed with standard web crawling + return await self._crawl_web(url, **kwargs) + + elif url.startswith('file://'): + # Process local file + local_file_path = url[7:] # Remove 'file://' prefix + if not os.path.exists(local_file_path): + raise FileNotFoundError(f"Local file not found: {local_file_path}") + with open(local_file_path, 'r', encoding='utf-8') as f: + html = f.read() + if screenshot_requested: + screenshot_data = await self._generate_screenshot_from_html(html) + return AsyncCrawlResponse( + html=html, + response_headers=response_headers, + status_code=status_code, + screenshot=screenshot_data, + get_delayed_content=None + ) + + elif url.startswith('raw:'): + # Process raw HTML content + raw_html = url[4:] # Remove 'raw:' prefix + html = raw_html + if screenshot_requested: + screenshot_data = await self._generate_screenshot_from_html(html) + return AsyncCrawlResponse( + html=html, + response_headers=response_headers, + status_code=status_code, + 
screenshot=screenshot_data, + get_delayed_content=None + ) + else: + raise ValueError("URL must start with 'http://', 'https://', 'file://', or 'raw:'") + + + async def _crawl_web(self, url: str, **kwargs) -> AsyncCrawlResponse: + """ + Existing web crawling logic remains unchanged. + + Args: + url (str): The web URL to crawl. + **kwargs: Additional parameters. + + Returns: + AsyncCrawlResponse: The response containing HTML, headers, status code, and optional screenshot. + """ + response_headers = {} + status_code = None + + # Reset downloaded files list for new crawl + self._downloaded_files = [] + + self._cleanup_expired_sessions() + session_id = kwargs.get("session_id") + + # Check if in kwargs we have user_agent that will override the default user_agent + user_agent = kwargs.get("user_agent", self.user_agent) + + # Generate random user agent if magic mode is enabled and user_agent_mode is not random + if kwargs.get("user_agent_mode") != "random" and kwargs.get("magic", False): + user_agent = UserAgentGenerator().generate( + **kwargs.get("user_agent_generator_config", {}) + ) + + # Handle page creation differently for managed browser + context = None + if self.use_managed_browser: + if session_id: + # Reuse existing session if available + context, page, _ = self.sessions.get(session_id, (None, None, None)) + if not page: + # Create new page in default context if session doesn't exist + page = await self.default_context.new_page() + self.sessions[session_id] = (self.default_context, page, time.time()) + else: + # Create new page in default context for non-session requests + page = await self.default_context.new_page() + else: + if session_id: + context, page, _ = self.sessions.get(session_id, (None, None, None)) + if not context: + if self.use_persistent_context and self.browser_type in ["chrome", "chromium"]: + # In persistent context, browser is the context + context = self.browser + else: + # Normal context creation for non-persistent or non-Chrome browsers + 
context = await self.browser.new_context( + user_agent=user_agent, + viewport={"width": self.viewport_width, "height": self.viewport_height}, + proxy={"server": self.proxy} if self.proxy else None, + java_script_enabled=True, + accept_downloads=self.accept_downloads, + storage_state=self.storage_state, + # downloads_path=self.downloads_path if self.accept_downloads else None + ) + await context.add_cookies([{"name": "cookiesEnabled", "value": "true", "url": url}]) + if self.cookies: + await context.add_cookies(self.cookies) + await context.set_extra_http_headers(self.headers) + + page = await context.new_page() + self.sessions[session_id] = (context, page, time.time()) + else: + if self.use_persistent_context and self.browser_type in ["chrome", "chromium"]: + # In persistent context, browser is the context + context = self.browser + else: + # Normal context creation + context = await self.browser.new_context( + user_agent=user_agent, + # viewport={"width": 1920, "height": 1080}, + viewport={"width": self.viewport_width, "height": self.viewport_height}, + proxy={"server": self.proxy} if self.proxy else None, + accept_downloads=self.accept_downloads, + storage_state=self.storage_state, + ignore_https_errors=True # Add this line + ) + if self.cookies: + await context.add_cookies(self.cookies) + await context.set_extra_http_headers(self.headers) + + if kwargs.get("override_navigator", False) or kwargs.get("simulate_user", False) or kwargs.get("magic", False): + # Inject scripts to override navigator properties + await context.add_init_script(""" + // Pass the Permissions Test. + const originalQuery = window.navigator.permissions.query; + window.navigator.permissions.query = (parameters) => ( + parameters.name === 'notifications' ? 
+ Promise.resolve({ state: Notification.permission }) : + originalQuery(parameters) + ); + Object.defineProperty(navigator, 'webdriver', { + get: () => undefined + }); + window.navigator.chrome = { + runtime: {}, + // Add other properties if necessary + }; + Object.defineProperty(navigator, 'plugins', { + get: () => [1, 2, 3, 4, 5], + }); + Object.defineProperty(navigator, 'languages', { + get: () => ['en-US', 'en'], + }); + Object.defineProperty(document, 'hidden', { + get: () => false + }); + Object.defineProperty(document, 'visibilityState', { + get: () => 'visible' + }); + """) + + page = await context.new_page() + if kwargs.get("magic", False): + await stealth_async(page, stealth_config) + + # Add console message and error logging + if kwargs.get("log_console", False): + page.on("console", lambda msg: print(f"Console: {msg.text}")) + page.on("pageerror", lambda exc: print(f"Page Error: {exc}")) + + try: + # Set up download handling if enabled + if self.accept_downloads: + page.on("download", lambda download: asyncio.create_task(self._handle_download(download))) + + if self.use_cached_html: + cache_file_path = os.path.join( + os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest() + ) + if os.path.exists(cache_file_path): + html = "" + with open(cache_file_path, "r") as f: + html = f.read() + # retrieve response headers and status code from cache + with open(cache_file_path + ".meta", "r") as f: + meta = json.load(f) + response_headers = meta.get("response_headers", {}) + status_code = meta.get("status_code") + response = AsyncCrawlResponse( + html=html, response_headers=response_headers, status_code=status_code + ) + return response + + if not kwargs.get("js_only", False): + await self.execute_hook('before_goto', page, context = context, **kwargs) + + try: + response = await page.goto( + url, + # wait_until=kwargs.get("wait_until", ["domcontentloaded", "networkidle"]), + 
wait_until=kwargs.get("wait_until", "domcontentloaded"), + timeout=kwargs.get("page_timeout", 60000), + ) + except Error as e: + raise RuntimeError(f"Failed on navigating ACS-GOTO :\n{str(e)}") + + await self.execute_hook('after_goto', page, context = context, **kwargs) + + # Get status code and headers + status_code = response.status + response_headers = response.headers + else: + status_code = 200 + response_headers = {} + + # Replace the current wait_for_selector line with this more robust check: + try: + # First wait for body to exist, regardless of visibility + await page.wait_for_selector('body', state='attached', timeout=30000) + + # Then wait for it to become visible by checking CSS + await page.wait_for_function(""" + () => { + const body = document.body; + const style = window.getComputedStyle(body); + return style.display !== 'none' && + style.visibility !== 'hidden' && + style.opacity !== '0'; + } + """, timeout=30000) + + except Error as e: + # If waiting fails, let's try to diagnose the issue + visibility_info = await page.evaluate(""" + () => { + const body = document.body; + const style = window.getComputedStyle(body); + return { + display: style.display, + visibility: style.visibility, + opacity: style.opacity, + hasContent: body.innerHTML.length, + classList: Array.from(body.classList) + } + } + """) + + if self.verbose: + print(f"Body visibility debug info: {visibility_info}") + + # Even if body is hidden, we might still want to proceed + if kwargs.get('ignore_body_visibility', True): + if self.verbose: + print("Proceeding despite hidden body...") + pass + else: + raise Error(f"Body element is hidden: {visibility_info}") + + # CONTENT LOADING ASSURANCE + if not self.text_only and (kwargs.get("wait_for_images", True) or kwargs.get("adjust_viewport_to_content", False)): + # Wait for network idle after initial load and images to load + # await page.wait_for_load_state("networkidle") + await page.wait_for_load_state("domcontentloaded") + await 
asyncio.sleep(0.1) + from playwright.async_api import TimeoutError as PlaywrightTimeoutError + try: + await page.wait_for_function("Array.from(document.images).every(img => img.complete)", timeout=1000) + # Check for TimeoutError and ignore it + except PlaywrightTimeoutError: + pass + + # After initial load, adjust viewport to content size + if not self.text_only and kwargs.get("adjust_viewport_to_content", False): + try: + # Get actual page dimensions + page_width = await page.evaluate("document.documentElement.scrollWidth") + page_height = await page.evaluate("document.documentElement.scrollHeight") + + target_width = self.viewport_width + target_height = int(target_width * page_width / page_height * 0.95) + await page.set_viewport_size({"width": target_width, "height": target_height}) + + # Compute scale factor + # We want the entire page visible: the scale should make both width and height fit + scale = min(target_width / page_width, target_height / page_height) + + # Now we call CDP to set metrics. + # We tell Chrome that the "device" is page_width x page_height in size, + # but we scale it down so everything fits within the real viewport. 
+ cdp = await page.context.new_cdp_session(page) + await cdp.send('Emulation.setDeviceMetricsOverride', { + 'width': page_width, # full page width + 'height': page_height, # full page height + 'deviceScaleFactor': 1, # keep normal DPR + 'mobile': False, + 'scale': scale # scale the entire rendered content + }) + + except Exception as e: + self.logger.warning( + message="Failed to adjust viewport to content: {error}", + tag="VIEWPORT", + params={"error": str(e)} + ) + + # After viewport adjustment, handle page scanning if requested + if kwargs.get("scan_full_page", False): + try: + viewport_height = page.viewport_size.get("height", self.viewport_height) + current_position = viewport_height # Start with one viewport height + scroll_delay = kwargs.get("scroll_delay", 0.2) + + # Initial scroll + await page.evaluate(f"window.scrollTo(0, {current_position})") + await asyncio.sleep(scroll_delay) + + # Get height after first scroll to account for any dynamic content + total_height = await page.evaluate("document.documentElement.scrollHeight") + + while current_position < total_height: + current_position = min(current_position + viewport_height, total_height) + await page.evaluate(f"window.scrollTo(0, {current_position})") + await asyncio.sleep(scroll_delay) + + # Check for dynamic content + new_height = await page.evaluate("document.documentElement.scrollHeight") + if new_height > total_height: + total_height = new_height + + # Scroll back to top + await page.evaluate("window.scrollTo(0, 0)") + + except Exception as e: + self.logger.warning( + message="Failed to perform full page scan: {error}", + tag="PAGE_SCAN", + params={"error": str(e)} + ) + else: + # Scroll to the bottom of the page + await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") + + js_code = kwargs.get("js_code", kwargs.get("js", self.js_code)) + if js_code: + if isinstance(js_code, str): + await page.evaluate(js_code) + elif isinstance(js_code, list): + for js in js_code: + await 
page.evaluate(js) + + # await page.wait_for_timeout(100) + + # Check for on execution event + await self.execute_hook('on_execution_started', page, context = context, **kwargs) + + if kwargs.get("simulate_user", False) or kwargs.get("magic", False): + # Simulate user interactions + await page.mouse.move(100, 100) + await page.mouse.down() + await page.mouse.up() + await page.keyboard.press('ArrowDown') + + # Handle the wait_for parameter + wait_for = kwargs.get("wait_for") + if wait_for: + try: + await self.smart_wait(page, wait_for, timeout=kwargs.get("page_timeout", 60000)) + except Exception as e: + raise RuntimeError(f"Wait condition failed: {str(e)}") + + # if not wait_for and js_code: + # await page.wait_for_load_state('networkidle', timeout=5000) + + # Update image dimensions + if not self.text_only: + update_image_dimensions_js = """ + () => { + return new Promise((resolve) => { + const filterImage = (img) => { + // Filter out images that are too small + if (img.width < 100 && img.height < 100) return false; + + // Filter out images that are not visible + const rect = img.getBoundingClientRect(); + if (rect.width === 0 || rect.height === 0) return false; + + // Filter out images with certain class names (e.g., icons, thumbnails) + if (img.classList.contains('icon') || img.classList.contains('thumbnail')) return false; + + // Filter out images with certain patterns in their src (e.g., placeholder images) + if (img.src.includes('placeholder') || img.src.includes('icon')) return false; + + return true; + }; + + const images = Array.from(document.querySelectorAll('img')).filter(filterImage); + let imagesLeft = images.length; + + if (imagesLeft === 0) { + resolve(); + return; + } + + const checkImage = (img) => { + if (img.complete && img.naturalWidth !== 0) { + img.setAttribute('width', img.naturalWidth); + img.setAttribute('height', img.naturalHeight); + imagesLeft--; + if (imagesLeft === 0) resolve(); + } + }; + + images.forEach(img => { + checkImage(img); + 
if (!img.complete) { + img.onload = () => { + checkImage(img); + }; + img.onerror = () => { + imagesLeft--; + if (imagesLeft === 0) resolve(); + }; + } + }); + + // Fallback timeout of 5 seconds + // setTimeout(() => resolve(), 5000); + resolve(); + }); + } + """ + + try: + try: + await page.wait_for_load_state( + # state="load", + state="domcontentloaded", + timeout=5 + ) + except PlaywrightTimeoutError: + pass + await page.evaluate(update_image_dimensions_js) + except Exception as e: + self.logger.error( + message="Error updating image dimensions ACS-UPDATE_IMAGE_DIMENSIONS_JS: {error}", + tag="ERROR", + params={"error": str(e)} + ) + # raise RuntimeError(f"Error updating image dimensions ACS-UPDATE_IMAGE_DIMENSIONS_JS: {str(e)}") + + # Wait a bit for any onload events to complete + # await page.wait_for_timeout(100) + + # Process iframes + if kwargs.get("process_iframes", False): + page = await self.process_iframes(page) + + await self.execute_hook('before_retrieve_html', page, context = context, **kwargs) + # Check if delay_before_return_html is set then wait for that time + delay_before_return_html = kwargs.get("delay_before_return_html", 0.1) + if delay_before_return_html: + await asyncio.sleep(delay_before_return_html) + + # Check for remove_overlay_elements parameter + if kwargs.get("remove_overlay_elements", False): + await self.remove_overlay_elements(page) + + html = await page.content() + await self.execute_hook('before_return_html', page, html, context = context, **kwargs) + + # Check if kwargs has screenshot=True then take screenshot + screenshot_data = None + if kwargs.get("screenshot"): + # Check we have screenshot_wait_for parameter, if we have simply wait for that time + screenshot_wait_for = kwargs.get("screenshot_wait_for") + if screenshot_wait_for: + await asyncio.sleep(screenshot_wait_for) + screenshot_data = await self.take_screenshot(page) + + # if self.verbose: + # print(f"[LOG] ✅ Crawled {url} successfully!") + + if self.use_cached_html: + 
cache_file_path = os.path.join( + os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest() + ) + with open(cache_file_path, "w", encoding="utf-8") as f: + f.write(html) + # store response headers and status code in cache + with open(cache_file_path + ".meta", "w", encoding="utf-8") as f: + json.dump({ + "response_headers": response_headers, + "status_code": status_code + }, f) + + async def get_delayed_content(delay: float = 5.0) -> str: + if self.verbose: + print(f"[LOG] Waiting for {delay} seconds before retrieving content for {url}") + await asyncio.sleep(delay) + return await page.content() + + response = AsyncCrawlResponse( + html=html, + response_headers=response_headers, + status_code=status_code, + screenshot=screenshot_data, + get_delayed_content=get_delayed_content, + downloaded_files=self._downloaded_files if self._downloaded_files else None + ) + return response + except Error as e: + raise Error(f"async_crawler_strategy.py:_crawleb(): {str(e)}") + # finally: + # if not session_id: + # await page.close() + # await context.close() + + async def _handle_download(self, download): + """Handle file downloads.""" + try: + suggested_filename = download.suggested_filename + download_path = os.path.join(self.downloads_path, suggested_filename) + + self.logger.info( + message="Downloading {filename} to {path}", + tag="FETCH", + params={"filename": suggested_filename, "path": download_path} + ) + + start_time = time.perf_counter() + await download.save_as(download_path) + end_time = time.perf_counter() + self._downloaded_files.append(download_path) + + self.logger.success( + message="Downloaded {filename} successfully", + tag="COMPLETE", + params={"filename": suggested_filename, "path": download_path, "duration": f"{end_time - start_time:.2f}s"} + ) + except Exception as e: + self.logger.error( + message="Failed to handle download: {error}", + tag="ERROR", + params={"error": str(e)} + ) + + # if self.verbose: 
+ # print(f"[ERROR] Failed to handle download: {str(e)}") + + async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]: + semaphore_count = kwargs.get('semaphore_count', 5) # Adjust as needed + semaphore = asyncio.Semaphore(semaphore_count) + + async def crawl_with_semaphore(url): + async with semaphore: + return await self.crawl(url, **kwargs) + + tasks = [crawl_with_semaphore(url) for url in urls] + results = await asyncio.gather(*tasks, return_exceptions=True) + return [result if not isinstance(result, Exception) else str(result) for result in results] + + async def remove_overlay_elements(self, page: Page) -> None: + """ + Removes popup overlays, modals, cookie notices, and other intrusive elements from the page. + + Args: + page (Page): The Playwright page instance + """ + remove_overlays_js = """ + async () => { + // Function to check if element is visible + const isVisible = (elem) => { + const style = window.getComputedStyle(elem); + return style.display !== 'none' && + style.visibility !== 'hidden' && + style.opacity !== '0'; + }; + + // Common selectors for popups and overlays + const commonSelectors = [ + // Close buttons first + 'button[class*="close" i]', 'button[class*="dismiss" i]', + 'button[aria-label*="close" i]', 'button[title*="close" i]', + 'a[class*="close" i]', 'span[class*="close" i]', + + // Cookie notices + '[class*="cookie-banner" i]', '[id*="cookie-banner" i]', + '[class*="cookie-consent" i]', '[id*="cookie-consent" i]', + + // Newsletter/subscription dialogs + '[class*="newsletter" i]', '[class*="subscribe" i]', + + // Generic popups/modals + '[class*="popup" i]', '[class*="modal" i]', + '[class*="overlay" i]', '[class*="dialog" i]', + '[role="dialog"]', '[role="alertdialog"]' + ]; + + // Try to click close buttons first + for (const selector of commonSelectors.slice(0, 6)) { + const closeButtons = document.querySelectorAll(selector); + for (const button of closeButtons) { + if (isVisible(button)) { + try { + 
button.click(); + await new Promise(resolve => setTimeout(resolve, 100)); + } catch (e) { + console.log('Error clicking button:', e); + } + } + } + } + + // Remove remaining overlay elements + const removeOverlays = () => { + // Find elements with high z-index + const allElements = document.querySelectorAll('*'); + for (const elem of allElements) { + const style = window.getComputedStyle(elem); + const zIndex = parseInt(style.zIndex); + const position = style.position; + + if ( + isVisible(elem) && + (zIndex > 999 || position === 'fixed' || position === 'absolute') && + ( + elem.offsetWidth > window.innerWidth * 0.5 || + elem.offsetHeight > window.innerHeight * 0.5 || + style.backgroundColor.includes('rgba') || + parseFloat(style.opacity) < 1 + ) + ) { + elem.remove(); + } + } + + // Remove elements matching common selectors + for (const selector of commonSelectors) { + const elements = document.querySelectorAll(selector); + elements.forEach(elem => { + if (isVisible(elem)) { + elem.remove(); + } + }); + } + }; + + // Remove overlay elements + removeOverlays(); + + // Remove any fixed/sticky position elements at the top/bottom + const removeFixedElements = () => { + const elements = document.querySelectorAll('*'); + elements.forEach(elem => { + const style = window.getComputedStyle(elem); + if ( + (style.position === 'fixed' || style.position === 'sticky') && + isVisible(elem) + ) { + elem.remove(); + } + }); + }; + + removeFixedElements(); + + // Remove empty block elements as: div, p, span, etc. 
+ const removeEmptyBlockElements = () => { + const blockElements = document.querySelectorAll('div, p, span, section, article, header, footer, aside, nav, main, ul, ol, li, dl, dt, dd, h1, h2, h3, h4, h5, h6'); + blockElements.forEach(elem => { + if (elem.innerText.trim() === '') { + elem.remove(); + } + }); + }; + + // Remove margin-right and padding-right from body (often added by modal scripts) + document.body.style.marginRight = '0px'; + document.body.style.paddingRight = '0px'; + document.body.style.overflow = 'auto'; + + // Wait a bit for any animations to complete + await new Promise(resolve => setTimeout(resolve, 100)); + } + """ + + try: + await page.evaluate(remove_overlays_js) + await page.wait_for_timeout(500) # Wait for any animations to complete + except Exception as e: + self.logger.warning( + message="Failed to remove overlay elements: {error}", + tag="SCRAPE", + params={"error": str(e)} + ) + # if self.verbose: + # print(f"Warning: Failed to remove overlay elements: {str(e)}") + + async def take_screenshot(self, page: Page) -> str: + """ + Takes a screenshot of the current page. 
+ + Args: + page (Page): The Playwright page instance + + Returns: + str: Base64-encoded screenshot image + """ + try: + # The page is already loaded, just take the screenshot + screenshot = await page.screenshot(full_page=True) + return base64.b64encode(screenshot).decode('utf-8') + except Exception as e: + error_message = f"Failed to take screenshot: {str(e)}" + self.logger.error( + message="Screenshot failed: {error}", + tag="ERROR", + params={"error": error_message} + ) + + + # Generate an error image + img = Image.new('RGB', (800, 600), color='black') + draw = ImageDraw.Draw(img) + font = ImageFont.load_default() + draw.text((10, 10), error_message, fill=(255, 255, 255), font=font) + + buffered = BytesIO() + img.save(buffered, format="JPEG") + return base64.b64encode(buffered.getvalue()).decode('utf-8') + finally: + await page.close() + + async def export_storage_state(self, path: str = None) -> dict: + """ + Exports the current storage state (cookies, localStorage, sessionStorage) + to a JSON file at the specified path. + """ + if self.default_context: + state = await self.default_context.storage_state(path=path) + self.logger.info( + message="Exported storage state to {path}", + tag="INFO", + params={"path": path} + ) + return state + else: + self.logger.warning( + message="No default_context available to export storage state.", + tag="WARNING" + ) + + async def _generate_screenshot_from_html(self, html: str) -> Optional[str]: + """ + Generates a screenshot from raw HTML content. + + Args: + html (str): The HTML content to render and capture. + + Returns: + Optional[str]: Base64-encoded screenshot image or an error image if failed. 
+ """ + try: + if not self.browser: + await self.start() + page = await self.browser.new_page() + await page.set_content(html, wait_until='networkidle') + screenshot = await page.screenshot(full_page=True) + await page.close() + return base64.b64encode(screenshot).decode('utf-8') + except Exception as e: + error_message = f"Failed to take screenshot: {str(e)}" + # print(error_message) + self.logger.error( + message="Screenshot failed: {error}", + tag="ERROR", + params={"error": error_message} + ) + + # Generate an error image + img = Image.new('RGB', (800, 600), color='black') + draw = ImageDraw.Draw(img) + font = ImageFont.load_default() + draw.text((10, 10), error_message, fill=(255, 255, 255), font=font) + + buffered = BytesIO() + img.save(buffered, format="JPEG") + return base64.b64encode(buffered.getvalue()).decode('utf-8') + diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index fca0c0ec..1d88c3a8 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -5,7 +5,7 @@ from abc import ABC, abstractmethod from typing import Callable, Dict, Any, List, Optional, Awaitable import os, sys, shutil import tempfile, subprocess -from playwright.async_api import async_playwright, Page, Browser, Error +from playwright.async_api import async_playwright, Page, Browser, Error, BrowserContext from playwright.async_api import TimeoutError as PlaywrightTimeoutError from io import BytesIO from PIL import Image, ImageDraw, ImageFont @@ -15,6 +15,7 @@ from pydantic import BaseModel import hashlib import json import uuid +from .js_snippet import load_js_script from .models import AsyncCrawlResponse from .utils import create_box_message from .user_agent_generator import UserAgentGenerator @@ -35,6 +36,28 @@ stealth_config = StealthConfig( media_codecs=True, ) +BROWSER_DISABLE_OPTIONS = [ + "--disable-background-networking", + "--disable-background-timer-throttling", + "--disable-backgrounding-occluded-windows", + 
"--disable-breakpad", + "--disable-client-side-phishing-detection", + "--disable-component-extensions-with-background-pages", + "--disable-default-apps", + "--disable-extensions", + "--disable-features=TranslateUI", + "--disable-hang-monitor", + "--disable-ipc-flooding-protection", + "--disable-popup-blocking", + "--disable-prompt-on-repost", + "--disable-sync", + "--force-color-profile=srgb", + "--metrics-recording-only", + "--no-first-run", + "--password-store=basic", + "--use-mock-keychain" +] + class ManagedBrowser: def __init__(self, browser_type: str = "chromium", user_data_dir: Optional[str] = None, headless: bool = False, logger = None, host: str = "localhost", debugging_port: int = 9222): @@ -197,10 +220,222 @@ class ManagedBrowser: ) +class BrowserManager: + def __init__(self, use_managed_browser: bool, user_data_dir: Optional[str], headless: bool, logger, browser_type: str, proxy, proxy_config, chrome_channel: str, viewport_width: int, viewport_height: int, accept_downloads: bool, storage_state, ignore_https_errors: bool, java_script_enabled: bool, cookies: List[dict], headers: dict, extra_args: List[str], text_only: bool, light_mode: bool, user_agent: str, browser_hint: str, downloads_path: Optional[str]): + self.use_managed_browser = use_managed_browser + self.user_data_dir = user_data_dir + self.headless = headless + self.logger = logger + self.browser_type = browser_type + self.proxy = proxy + self.proxy_config = proxy_config + self.chrome_channel = chrome_channel + self.viewport_width = viewport_width + self.viewport_height = viewport_height + self.accept_downloads = accept_downloads + self.storage_state = storage_state + self.ignore_https_errors = ignore_https_errors + self.java_script_enabled = java_script_enabled + self.cookies = cookies or [] + self.headers = headers or {} + self.extra_args = extra_args or [] + self.text_only = text_only + self.light_mode = light_mode + self.browser = None + self.default_context : BrowserContext = None + 
self.managed_browser = None + self.sessions = {} + self.session_ttl = 1800 + self.playwright = None + self.user_agent = user_agent + self.browser_hint = browser_hint + self.downloads_path = downloads_path + + async def start(self): + if self.playwright is None: + from playwright.async_api import async_playwright + self.playwright = await async_playwright().start() + + if self.use_managed_browser: + self.managed_browser = ManagedBrowser( + browser_type=self.browser_type, + user_data_dir=self.user_data_dir, + headless=self.headless, + logger=self.logger + ) + cdp_url = await self.managed_browser.start() + self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url) + contexts = self.browser.contexts + if contexts: + self.default_context = contexts[0] + else: + self.default_context = await self.browser.new_context( + viewport={"width": self.viewport_width, "height": self.viewport_height}, + storage_state=self.storage_state, + user_agent=self.headers.get("User-Agent"), + accept_downloads=self.accept_downloads, + ignore_https_errors=self.ignore_https_errors, + java_script_enabled=self.java_script_enabled + ) + await self.setup_context(self.default_context) + else: + browser_args = { + "headless": self.headless, + "args": [ + "--no-sandbox", + "--disable-dev-shm-usage", + "--no-first-run", + "--no-default-browser-check", + "--disable-infobars", + "--window-position=0,0", + "--ignore-certificate-errors", + "--ignore-certificate-errors-spki-list", + "--disable-blink-features=AutomationControlled", + "--window-position=400,0", + f"--window-size={self.viewport_width},{self.viewport_height}", + ] + } + + if self.light_mode: + browser_args["args"].extend(BROWSER_DISABLE_OPTIONS) + + if self.text_only: + browser_args["args"].extend(['--blink-settings=imagesEnabled=false','--disable-remote-fonts']) + + if self.chrome_channel: + browser_args["channel"] = self.chrome_channel + + if self.extra_args: + browser_args["args"].extend(self.extra_args) + + if 
self.accept_downloads: + browser_args["downloads_path"] = os.path.join(os.getcwd(), "downloads") + os.makedirs(browser_args["downloads_path"], exist_ok=True) + + if self.proxy: + from playwright.async_api import ProxySettings + proxy_settings = ProxySettings(server=self.proxy) + browser_args["proxy"] = proxy_settings + elif self.proxy_config: + from playwright.async_api import ProxySettings + proxy_settings = ProxySettings( + server=self.proxy_config.get("server"), + username=self.proxy_config.get("username"), + password=self.proxy_config.get("password") + ) + browser_args["proxy"] = proxy_settings + + if self.browser_type == "firefox": + self.browser = await self.playwright.firefox.launch(**browser_args) + elif self.browser_type == "webkit": + self.browser = await self.playwright.webkit.launch(**browser_args) + else: + self.browser = await self.playwright.chromium.launch(**browser_args) + + self.default_context = self.browser + # Since default_context in non-managed mode is the browser, no setup needed here. + + + async def setup_context(self, context : BrowserContext, is_default=False): + # Set extra headers + if self.headers: + await context.set_extra_http_headers(self.headers) + + # Add cookies if any + if self.cookies: + await context.add_cookies(self.cookies) + + # Ensure storage_state if provided + if self.storage_state: + # If storage_state is a dictionary or file path, Playwright will handle it. 
+ await context.storage_state(path=None) + + # If accept_downloads, set timeouts and ensure properties + if self.accept_downloads: + await context.set_default_timeout(60000) + await context.set_default_navigation_timeout(60000) + if self.downloads_path: + context._impl_obj._options["accept_downloads"] = True + context._impl_obj._options["downloads_path"] = self.downloads_path + + # If we have a user_agent, override it along with sec-ch-ua + if self.user_agent: + # Merge headers if needed + combined_headers = {"User-Agent": self.user_agent, "sec-ch-ua": self.browser_hint} + combined_headers.update(self.headers) + await context.set_extra_http_headers(combined_headers) + + async def close(self): + # Close all active sessions + session_ids = list(self.sessions.keys()) + for session_id in session_ids: + await self.kill_session(session_id) + + if self.browser: + await self.browser.close() + self.browser = None + + if self.managed_browser: + await asyncio.sleep(0.5) + await self.managed_browser.cleanup() + self.managed_browser = None + + if self.playwright: + await self.playwright.stop() + self.playwright = None + + async def get_page(self, session_id: Optional[str], user_agent: str): + # Cleanup expired sessions + self._cleanup_expired_sessions() + + if session_id: + context, page, _ = self.sessions.get(session_id, (None, None, None)) + if context and page: + self.sessions[session_id] = (context, page, time.time()) + return page, context + + # Create a new context/page pair + if self.use_managed_browser: + context = self.default_context + page = await context.new_page() + else: + context = await self.browser.new_context( + user_agent=user_agent, + viewport={"width": self.viewport_width, "height": self.viewport_height}, + proxy={"server": self.proxy} if self.proxy else None, + accept_downloads=self.accept_downloads, + storage_state=self.storage_state, + ignore_https_errors=self.ignore_https_errors + ) + await self.setup_context(context) + page = await context.new_page() + 
+ if session_id: + self.sessions[session_id] = (context, page, time.time()) + + return page, context + + async def kill_session(self, session_id: str): + if session_id in self.sessions: + context, page, _ = self.sessions[session_id] + await page.close() + if not self.use_managed_browser: + await context.close() + del self.sessions[session_id] + + def _cleanup_expired_sessions(self): + current_time = time.time() + expired_sessions = [ + sid for sid, (_, _, last_used) in self.sessions.items() + if current_time - last_used > self.session_ttl + ] + for sid in expired_sessions: + asyncio.create_task(self.kill_session(sid)) + class AsyncCrawlerStrategy(ABC): @abstractmethod async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse: - pass + pass # 4 + 3 @abstractmethod async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]: @@ -265,6 +500,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): self.use_managed_browser = kwargs.get("use_managed_browser", False) self.user_data_dir = kwargs.get("user_data_dir", None) self.use_persistent_context = kwargs.get("use_persistent_context", False) + if self.use_persistent_context: + self.use_managed_browser = True self.chrome_channel = kwargs.get("chrome_channel", "chrome") self.managed_browser = None self.default_context = None @@ -278,13 +515,39 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): 'before_retrieve_html': None } self.extra_args = kwargs.get("extra_args", []) + self.ignore_https_errors = kwargs.get("ignore_https_errors", True) + self.java_script_enabled = kwargs.get("java_script_enabled", True) self.accept_downloads = kwargs.get("accept_downloads", False) self.downloads_path = kwargs.get("downloads_path") self._downloaded_files = [] # Track downloaded files for current crawl if self.accept_downloads and not self.downloads_path: self.downloads_path = os.path.join(os.getcwd(), "downloads") os.makedirs(self.downloads_path, exist_ok=True) - + + self.browser_manager = 
BrowserManager( + use_managed_browser=self.use_managed_browser, + user_data_dir=self.user_data_dir, + headless=self.headless, + logger=self.logger, + browser_type=self.browser_type, + proxy=self.proxy, + proxy_config=self.proxy_config, + chrome_channel=self.chrome_channel, + viewport_width=self.viewport_width, + viewport_height=self.viewport_height, + accept_downloads=self.accept_downloads, + storage_state=self.storage_state, + ignore_https_errors=self.ignore_https_errors, + java_script_enabled=self.java_script_enabled, + cookies=self.cookies, + headers=self.headers, + extra_args=self.extra_args, + text_only=self.text_only, + light_mode=self.light_mode, + user_agent=self.user_agent, + browser_hint=self.browser_hint, + downloads_path=self.downloads_path + ) async def __aenter__(self): await self.start() @@ -294,183 +557,14 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): await self.close() async def start(self): - if self.playwright is None: - self.playwright = await async_playwright().start() - if self.browser is None: - if self.use_managed_browser: - # Use managed browser approach - self.managed_browser = ManagedBrowser( - browser_type=self.browser_type, - user_data_dir=self.user_data_dir, - headless=self.headless, - logger=self.logger - ) - cdp_url = await self.managed_browser.start() - self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url) - - # Get the default context that maintains the user profile - contexts = self.browser.contexts - if contexts: - self.default_context = contexts[0] - else: - # If no default context exists, create one - self.default_context = await self.browser.new_context( - # viewport={"width": 1920, "height": 1080} - viewport={"width": self.viewport_width, "height": self.viewport_height}, - storage_state=self.storage_state, - ) - - # Set up the default context - if self.default_context: - await self.default_context.set_extra_http_headers(self.headers) - if self.cookies: - await 
self.default_context.add_cookies(self.cookies) - if self.storage_state: - # If storage_state is a dictionary or file path, Playwright will handle it. - await self.default_context.storage_state(path=None) # Just ensuring default_context is ready - if self.accept_downloads: - await self.default_context.set_default_timeout(60000) - await self.default_context.set_default_navigation_timeout(60000) - self.default_context._impl_obj._options["accept_downloads"] = True - self.default_context._impl_obj._options["downloads_path"] = self.downloads_path - - if self.user_agent: - await self.default_context.set_extra_http_headers({ - "User-Agent": self.user_agent, - "sec-ch-ua": self.browser_hint, - # **self.headers - }) - else: - # Base browser arguments - browser_args = { - "headless": self.headless, - "args": [ - "--no-sandbox", - "--disable-dev-shm-usage", - "--no-first-run", - "--no-default-browser-check", - "--disable-infobars", - "--window-position=0,0", - "--ignore-certificate-errors", - "--ignore-certificate-errors-spki-list", - "--disable-blink-features=AutomationControlled", - "--window-position=400,0", - f"--window-size={self.viewport_width},{self.viewport_height}", - ] - } - - if self.light_mode: - browser_args["args"].extend([ - # "--disable-background-networking", - "--disable-background-timer-throttling", - "--disable-backgrounding-occluded-windows", - "--disable-breakpad", - "--disable-client-side-phishing-detection", - "--disable-component-extensions-with-background-pages", - "--disable-default-apps", - "--disable-extensions", - "--disable-features=TranslateUI", - "--disable-hang-monitor", - "--disable-ipc-flooding-protection", - "--disable-popup-blocking", - "--disable-prompt-on-repost", - "--disable-sync", - "--force-color-profile=srgb", - "--metrics-recording-only", - "--no-first-run", - "--password-store=basic", - "--use-mock-keychain" - ]) - - if self.text_only: - browser_args["args"].extend([ - '--blink-settings=imagesEnabled=false', - 
'--disable-remote-fonts' - ]) - - # Add channel if specified (try Chrome first) - if self.chrome_channel: - browser_args["channel"] = self.chrome_channel - - # Add extra args if provided - if self.extra_args: - browser_args["args"].extend(self.extra_args) - - # Add downloads path if downloads are enabled - if self.accept_downloads: - browser_args["downloads_path"] = self.downloads_path - - # Add proxy settings if a proxy is specified - if self.proxy: - proxy_settings = ProxySettings(server=self.proxy) - browser_args["proxy"] = proxy_settings - elif self.proxy_config: - proxy_settings = ProxySettings( - server=self.proxy_config.get("server"), - username=self.proxy_config.get("username"), - password=self.proxy_config.get("password") - ) - browser_args["proxy"] = proxy_settings - - try: - # Select the appropriate browser based on the browser_type - if self.browser_type == "firefox": - self.browser = await self.playwright.firefox.launch(**browser_args) - elif self.browser_type == "webkit": - if "viewport" not in browser_args: - browser_args["viewport"] = {"width": self.viewport_width, "height": self.viewport_height} - self.browser = await self.playwright.webkit.launch(**browser_args) - else: - if self.use_persistent_context and self.user_data_dir: - self.browser = await self.playwright.chromium.launch_persistent_context( - user_data_dir=self.user_data_dir, - accept_downloads=self.accept_downloads, - downloads_path=self.downloads_path if self.accept_downloads else None, - **browser_args - ) - self.default_context = self.browser - else: - self.browser = await self.playwright.chromium.launch(**browser_args) - self.default_context = self.browser - - except Exception as e: - # Fallback to chromium if Chrome channel fails - if "chrome" in str(e) and browser_args.get("channel") == "chrome": - browser_args["channel"] = "chromium" - if self.use_persistent_context and self.user_data_dir: - self.browser = await self.playwright.chromium.launch_persistent_context( - 
user_data_dir=self.user_data_dir, - **browser_args - ) - self.default_context = self.browser - else: - self.browser = await self.playwright.chromium.launch(**browser_args) - else: - raise - - await self.execute_hook('on_browser_created', self.browser) - + await self.browser_manager.start() + await self.execute_hook('on_browser_created', self.browser_manager.browser, context = self.browser_manager.default_context) + async def close(self): if self.sleep_on_close: await asyncio.sleep(0.5) - # Close all active sessions - session_ids = list(self.sessions.keys()) - for session_id in session_ids: - await self.kill_session(session_id) - - if self.browser: - await self.browser.close() - self.browser = None - - if self.managed_browser: - await asyncio.sleep(0.5) - await self.managed_browser.cleanup() - self.managed_browser = None - - if self.playwright: - await self.playwright.stop() - self.playwright = None + await self.browser_manager.close() # Issue #256: Remove __del__ method to avoid potential issues with async cleanup # def __del__(self): @@ -631,35 +725,13 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): async def create_session(self, **kwargs) -> str: """Creates a new browser session and returns its ID.""" - if not self.browser: - await self.start() - + await self.start() + session_id = kwargs.get('session_id') or str(uuid.uuid4()) - if self.use_managed_browser: - page = await self.default_context.new_page() - self.sessions[session_id] = (self.default_context, page, time.time()) - else: - if self.use_persistent_context and self.browser_type in ["chrome", "chromium"]: - context = self.browser - page = await context.new_page() - else: - context = await self.browser.new_context( - user_agent=kwargs.get("user_agent", self.user_agent), - viewport={"width": self.viewport_width, "height": self.viewport_height}, - proxy={"server": self.proxy} if self.proxy else None, - accept_downloads=self.accept_downloads, - storage_state=self.storage_state, - 
ignore_https_errors=True - ) - - if self.cookies: - await context.add_cookies(self.cookies) - await context.set_extra_http_headers(self.headers) - page = await context.new_page() - - self.sessions[session_id] = (context, page, time.time()) - + user_agent = kwargs.get("user_agent", self.user_agent) + # Use browser_manager to get a fresh page & context assigned to this session_id + page, context = await self.browser_manager.get_page(session_id, user_agent) return session_id async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse: @@ -720,18 +792,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): else: raise ValueError("URL must start with 'http://', 'https://', 'file://', or 'raw:'") - async def _crawl_web(self, url: str, **kwargs) -> AsyncCrawlResponse: - """ - Existing web crawling logic remains unchanged. - - Args: - url (str): The web URL to crawl. - **kwargs: Additional parameters. - - Returns: - AsyncCrawlResponse: The response containing HTML, headers, status code, and optional screenshot. 
- """ response_headers = {} status_code = None @@ -751,97 +812,13 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): ) # Handle page creation differently for managed browser - context = None - if self.use_managed_browser: - if session_id: - # Reuse existing session if available - context, page, _ = self.sessions.get(session_id, (None, None, None)) - if not page: - # Create new page in default context if session doesn't exist - page = await self.default_context.new_page() - self.sessions[session_id] = (self.default_context, page, time.time()) - else: - # Create new page in default context for non-session requests - page = await self.default_context.new_page() - else: - if session_id: - context, page, _ = self.sessions.get(session_id, (None, None, None)) - if not context: - if self.use_persistent_context and self.browser_type in ["chrome", "chromium"]: - # In persistent context, browser is the context - context = self.browser - else: - # Normal context creation for non-persistent or non-Chrome browsers - context = await self.browser.new_context( - user_agent=user_agent, - viewport={"width": self.viewport_width, "height": self.viewport_height}, - proxy={"server": self.proxy} if self.proxy else None, - java_script_enabled=True, - accept_downloads=self.accept_downloads, - storage_state=self.storage_state, - # downloads_path=self.downloads_path if self.accept_downloads else None - ) - await context.add_cookies([{"name": "cookiesEnabled", "value": "true", "url": url}]) - if self.cookies: - await context.add_cookies(self.cookies) - await context.set_extra_http_headers(self.headers) - - page = await context.new_page() - self.sessions[session_id] = (context, page, time.time()) - else: - if self.use_persistent_context and self.browser_type in ["chrome", "chromium"]: - # In persistent context, browser is the context - context = self.browser - else: - # Normal context creation - context = await self.browser.new_context( - user_agent=user_agent, - # 
viewport={"width": 1920, "height": 1080}, - viewport={"width": self.viewport_width, "height": self.viewport_height}, - proxy={"server": self.proxy} if self.proxy else None, - accept_downloads=self.accept_downloads, - storage_state=self.storage_state, - ignore_https_errors=True # Add this line - ) - if self.cookies: - await context.add_cookies(self.cookies) - await context.set_extra_http_headers(self.headers) - - if kwargs.get("override_navigator", False) or kwargs.get("simulate_user", False) or kwargs.get("magic", False): - # Inject scripts to override navigator properties - await context.add_init_script(""" - // Pass the Permissions Test. - const originalQuery = window.navigator.permissions.query; - window.navigator.permissions.query = (parameters) => ( - parameters.name === 'notifications' ? - Promise.resolve({ state: Notification.permission }) : - originalQuery(parameters) - ); - Object.defineProperty(navigator, 'webdriver', { - get: () => undefined - }); - window.navigator.chrome = { - runtime: {}, - // Add other properties if necessary - }; - Object.defineProperty(navigator, 'plugins', { - get: () => [1, 2, 3, 4, 5], - }); - Object.defineProperty(navigator, 'languages', { - get: () => ['en-US', 'en'], - }); - Object.defineProperty(document, 'hidden', { - get: () => false - }); - Object.defineProperty(document, 'visibilityState', { - get: () => 'visible' - }); - """) - - page = await context.new_page() - if kwargs.get("magic", False): - await stealth_async(page, stealth_config) - + page, context = await self.browser_manager.get_page(session_id, user_agent) + await context.add_cookies([{"name": "cookiesEnabled", "value": "true", "url": url}]) + + if kwargs.get("override_navigator", False) or kwargs.get("simulate_user", False) or kwargs.get("magic", False): + # Inject scripts to override navigator properties + await context.add_init_script(load_js_script("navigator_overrider")) + # Add console message and error logging if kwargs.get("log_console", False): 
page.on("console", lambda msg: print(f"Console: {msg.text}")) @@ -1052,62 +1029,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # Update image dimensions if not self.text_only: - update_image_dimensions_js = """ - () => { - return new Promise((resolve) => { - const filterImage = (img) => { - // Filter out images that are too small - if (img.width < 100 && img.height < 100) return false; - - // Filter out images that are not visible - const rect = img.getBoundingClientRect(); - if (rect.width === 0 || rect.height === 0) return false; - - // Filter out images with certain class names (e.g., icons, thumbnails) - if (img.classList.contains('icon') || img.classList.contains('thumbnail')) return false; - - // Filter out images with certain patterns in their src (e.g., placeholder images) - if (img.src.includes('placeholder') || img.src.includes('icon')) return false; - - return true; - }; - - const images = Array.from(document.querySelectorAll('img')).filter(filterImage); - let imagesLeft = images.length; - - if (imagesLeft === 0) { - resolve(); - return; - } - - const checkImage = (img) => { - if (img.complete && img.naturalWidth !== 0) { - img.setAttribute('width', img.naturalWidth); - img.setAttribute('height', img.naturalHeight); - imagesLeft--; - if (imagesLeft === 0) resolve(); - } - }; - - images.forEach(img => { - checkImage(img); - if (!img.complete) { - img.onload = () => { - checkImage(img); - }; - img.onerror = () => { - imagesLeft--; - if (imagesLeft === 0) resolve(); - }; - } - }); - - // Fallback timeout of 5 seconds - // setTimeout(() => resolve(), 5000); - resolve(); - }); - } - """ + update_image_dimensions_js = load_js_script("update_image_dimensions") try: try: @@ -1245,124 +1167,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): Args: page (Page): The Playwright page instance """ - remove_overlays_js = """ - async () => { - // Function to check if element is visible - const isVisible = (elem) => { - const style = 
window.getComputedStyle(elem); - return style.display !== 'none' && - style.visibility !== 'hidden' && - style.opacity !== '0'; - }; - - // Common selectors for popups and overlays - const commonSelectors = [ - // Close buttons first - 'button[class*="close" i]', 'button[class*="dismiss" i]', - 'button[aria-label*="close" i]', 'button[title*="close" i]', - 'a[class*="close" i]', 'span[class*="close" i]', - - // Cookie notices - '[class*="cookie-banner" i]', '[id*="cookie-banner" i]', - '[class*="cookie-consent" i]', '[id*="cookie-consent" i]', - - // Newsletter/subscription dialogs - '[class*="newsletter" i]', '[class*="subscribe" i]', - - // Generic popups/modals - '[class*="popup" i]', '[class*="modal" i]', - '[class*="overlay" i]', '[class*="dialog" i]', - '[role="dialog"]', '[role="alertdialog"]' - ]; - - // Try to click close buttons first - for (const selector of commonSelectors.slice(0, 6)) { - const closeButtons = document.querySelectorAll(selector); - for (const button of closeButtons) { - if (isVisible(button)) { - try { - button.click(); - await new Promise(resolve => setTimeout(resolve, 100)); - } catch (e) { - console.log('Error clicking button:', e); - } - } - } - } - - // Remove remaining overlay elements - const removeOverlays = () => { - // Find elements with high z-index - const allElements = document.querySelectorAll('*'); - for (const elem of allElements) { - const style = window.getComputedStyle(elem); - const zIndex = parseInt(style.zIndex); - const position = style.position; - - if ( - isVisible(elem) && - (zIndex > 999 || position === 'fixed' || position === 'absolute') && - ( - elem.offsetWidth > window.innerWidth * 0.5 || - elem.offsetHeight > window.innerHeight * 0.5 || - style.backgroundColor.includes('rgba') || - parseFloat(style.opacity) < 1 - ) - ) { - elem.remove(); - } - } - - // Remove elements matching common selectors - for (const selector of commonSelectors) { - const elements = document.querySelectorAll(selector); - 
elements.forEach(elem => { - if (isVisible(elem)) { - elem.remove(); - } - }); - } - }; - - // Remove overlay elements - removeOverlays(); - - // Remove any fixed/sticky position elements at the top/bottom - const removeFixedElements = () => { - const elements = document.querySelectorAll('*'); - elements.forEach(elem => { - const style = window.getComputedStyle(elem); - if ( - (style.position === 'fixed' || style.position === 'sticky') && - isVisible(elem) - ) { - elem.remove(); - } - }); - }; - - removeFixedElements(); - - // Remove empty block elements as: div, p, span, etc. - const removeEmptyBlockElements = () => { - const blockElements = document.querySelectorAll('div, p, span, section, article, header, footer, aside, nav, main, ul, ol, li, dl, dt, dd, h1, h2, h3, h4, h5, h6'); - blockElements.forEach(elem => { - if (elem.innerText.trim() === '') { - elem.remove(); - } - }); - }; - - // Remove margin-right and padding-right from body (often added by modal scripts) - document.body.style.marginRight = '0px'; - document.body.style.paddingRight = '0px'; - document.body.style.overflow = 'auto'; - - // Wait a bit for any animations to complete - await new Promise(resolve => setTimeout(resolve, 100)); - } - """ - + remove_overlays_js = load_js_script("remove_overlays") + try: await page.evaluate(remove_overlays_js) await page.wait_for_timeout(500) # Wait for any animations to complete @@ -1440,9 +1246,10 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): Optional[str]: Base64-encoded screenshot image or an error image if failed. 
""" try: - if not self.browser: - await self.start() - page = await self.browser.new_page() + await self.start() + # Create a temporary page without a session_id + page, context = await self.browser_manager.get_page(None, self.user_agent) + await page.set_content(html, wait_until='networkidle') screenshot = await page.screenshot(full_page=True) await page.close() diff --git a/crawl4ai/async_tools.py b/crawl4ai/async_tools.py new file mode 100644 index 00000000..157e5596 --- /dev/null +++ b/crawl4ai/async_tools.py @@ -0,0 +1,183 @@ +import asyncio +import base64 +import time +from abc import ABC, abstractmethod +from typing import Callable, Dict, Any, List, Optional, Awaitable +import os, sys, shutil +import tempfile, subprocess +from playwright.async_api import async_playwright, Page, Browser, Error +from playwright.async_api import TimeoutError as PlaywrightTimeoutError +from io import BytesIO +from PIL import Image, ImageDraw, ImageFont +from pathlib import Path +from playwright.async_api import ProxySettings +from pydantic import BaseModel +import hashlib +import json +import uuid +from .models import AsyncCrawlResponse +from .utils import create_box_message +from .user_agent_generator import UserAgentGenerator +from playwright_stealth import StealthConfig, stealth_async + + +class ManagedBrowser: + def __init__(self, browser_type: str = "chromium", user_data_dir: Optional[str] = None, headless: bool = False, logger = None, host: str = "localhost", debugging_port: int = 9222): + self.browser_type = browser_type + self.user_data_dir = user_data_dir + self.headless = headless + self.browser_process = None + self.temp_dir = None + self.debugging_port = debugging_port + self.host = host + self.logger = logger + self.shutting_down = False + + async def start(self) -> str: + """ + Starts the browser process and returns the CDP endpoint URL. + If user_data_dir is not provided, creates a temporary directory. 
+ """ + + # Create temp dir if needed + if not self.user_data_dir: + self.temp_dir = tempfile.mkdtemp(prefix="browser-profile-") + self.user_data_dir = self.temp_dir + + # Get browser path and args based on OS and browser type + browser_path = self._get_browser_path() + args = self._get_browser_args() + + # Start browser process + try: + self.browser_process = subprocess.Popen( + args, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE + ) + # Monitor browser process output for errors + asyncio.create_task(self._monitor_browser_process()) + await asyncio.sleep(2) # Give browser time to start + return f"http://{self.host}:{self.debugging_port}" + except Exception as e: + await self.cleanup() + raise Exception(f"Failed to start browser: {e}") + + async def _monitor_browser_process(self): + """Monitor the browser process for unexpected termination.""" + if self.browser_process: + try: + stdout, stderr = await asyncio.gather( + asyncio.to_thread(self.browser_process.stdout.read), + asyncio.to_thread(self.browser_process.stderr.read) + ) + + # Check shutting_down flag BEFORE logging anything + if self.browser_process.poll() is not None: + if not self.shutting_down: + self.logger.error( + message="Browser process terminated unexpectedly | Code: {code} | STDOUT: {stdout} | STDERR: {stderr}", + tag="ERROR", + params={ + "code": self.browser_process.returncode, + "stdout": stdout.decode(), + "stderr": stderr.decode() + } + ) + await self.cleanup() + else: + self.logger.info( + message="Browser process terminated normally | Code: {code}", + tag="INFO", + params={"code": self.browser_process.returncode} + ) + except Exception as e: + if not self.shutting_down: + self.logger.error( + message="Error monitoring browser process: {error}", + tag="ERROR", + params={"error": str(e)} + ) + + def _get_browser_path(self) -> str: + """Returns the browser executable path based on OS and browser type""" + if sys.platform == "darwin": # macOS + paths = { + "chromium": "/Applications/Google 
Chrome.app/Contents/MacOS/Google Chrome", + "firefox": "/Applications/Firefox.app/Contents/MacOS/firefox", + "webkit": "/Applications/Safari.app/Contents/MacOS/Safari" + } + elif sys.platform == "win32": # Windows + paths = { + "chromium": "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe", + "firefox": "C:\\Program Files\\Mozilla Firefox\\firefox.exe", + "webkit": None # WebKit not supported on Windows + } + else: # Linux + paths = { + "chromium": "google-chrome", + "firefox": "firefox", + "webkit": None # WebKit not supported on Linux + } + + return paths.get(self.browser_type) + + def _get_browser_args(self) -> List[str]: + """Returns browser-specific command line arguments""" + base_args = [self._get_browser_path()] + + if self.browser_type == "chromium": + args = [ + f"--remote-debugging-port={self.debugging_port}", + f"--user-data-dir={self.user_data_dir}", + ] + if self.headless: + args.append("--headless=new") + elif self.browser_type == "firefox": + args = [ + "--remote-debugging-port", str(self.debugging_port), + "--profile", self.user_data_dir, + ] + if self.headless: + args.append("--headless") + else: + raise NotImplementedError(f"Browser type {self.browser_type} not supported") + + return base_args + args + + async def cleanup(self): + """Cleanup browser process and temporary directory""" + # Set shutting_down flag BEFORE any termination actions + self.shutting_down = True + + if self.browser_process: + try: + self.browser_process.terminate() + # Wait for process to end gracefully + for _ in range(10): # 10 attempts, 100ms each + if self.browser_process.poll() is not None: + break + await asyncio.sleep(0.1) + + # Force kill if still running + if self.browser_process.poll() is None: + self.browser_process.kill() + await asyncio.sleep(0.1) # Brief wait for kill to take effect + + except Exception as e: + self.logger.error( + message="Error terminating browser: {error}", + tag="ERROR", + params={"error": str(e)} + ) + + if self.temp_dir and 
os.path.exists(self.temp_dir): + try: + shutil.rmtree(self.temp_dir) + except Exception as e: + self.logger.error( + message="Error removing temporary directory: {error}", + tag="ERROR", + params={"error": str(e)} + ) + diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index b872c20c..1a4b1333 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -7,7 +7,7 @@ from pathlib import Path from typing import Optional, List, Union import json import asyncio -from contextlib import nullcontext +from contextlib import nullcontext, asynccontextmanager from .models import CrawlResult, MarkdownGenerationResult from .async_database import async_db_manager from .chunking_strategy import * @@ -122,15 +122,14 @@ class AsyncWebCrawler: async def __aexit__(self, exc_type, exc_val, exc_tb): await self.crawler_strategy.__aexit__(exc_type, exc_val, exc_tb) + @asynccontextmanager + async def nullcontext(self): + yield + async def awarmup(self): """Initialize the crawler with warm-up sequence.""" self.logger.info(f"Crawl4AI {crawl4ai_version}", tag="INIT") - # if self.verbose: - # print(f"{Fore.CYAN}{self.tag_format('INIT')} {self.log_icons['INIT']} Crawl4AI {crawl4ai_version}{Style.RESET_ALL}") - # print(f"{Fore.CYAN}{self.tag_format('INIT')} {self.log_icons['INIT']} Warming up AsyncWebCrawler{Style.RESET_ALL}") self.ready = True - # if self.verbose: - # print(f"{Fore.GREEN}{self.tag_format('READY')} {self.log_icons['READY']} AsyncWebCrawler initialized{Style.RESET_ALL}") async def arun( self, @@ -186,7 +185,7 @@ class AsyncWebCrawler: if not isinstance(url, str) or not url: raise ValueError("Invalid URL, make sure the URL is a non-empty string") - async with self._lock or nullcontext(): + async with self._lock or self.nullcontext(): # Lock for thread safety previously -> nullcontext(): try: # Handle deprecated parameters if any([bypass_cache, disable_cache, no_cache_read, no_cache_write]): diff --git 
a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index 970c40f0..f58e1eac 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -14,15 +14,11 @@ from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter#, from .markdown_generation_strategy import MarkdownGenerationStrategy, DefaultMarkdownGenerator from .models import MarkdownGenerationResult from .utils import ( - sanitize_input_encode, - sanitize_html, extract_metadata, - InvalidCSSSelectorError, - CustomHTML2Text, normalize_url, is_external_url ) -from .tools import profile_and_time + # Pre-compile regular expressions for Open Graph and Twitter metadata OG_REGEX = re.compile(r'^og:') @@ -76,10 +72,10 @@ class WebScrapingStrategy(ContentScrapingStrategy): log_method(message=message, tag=tag, **kwargs) def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]: - return self._get_content_of_website_optimized(url, html, is_async=False, **kwargs) + return self._scrap(url, html, is_async=False, **kwargs) async def ascrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]: - return await asyncio.to_thread(self._get_content_of_website_optimized, url, html, **kwargs) + return await asyncio.to_thread(self._scrap, url, html, **kwargs) def _generate_markdown_content(self, cleaned_html: str, @@ -103,8 +99,6 @@ class WebScrapingStrategy(ContentScrapingStrategy): html2text_options=kwargs.get('html2text', {}) ) - help_message = """""" - return { 'markdown': markdown_result.raw_markdown, 'fit_markdown': markdown_result.fit_markdown, @@ -126,38 +120,40 @@ class WebScrapingStrategy(ContentScrapingStrategy): } # Legacy method - h = CustomHTML2Text() - h.update_params(**kwargs.get('html2text', {})) - markdown = h.handle(cleaned_html) - markdown = markdown.replace(' ```', '```') + """ + # h = CustomHTML2Text() + # h.update_params(**kwargs.get('html2text', {})) + # markdown = h.handle(cleaned_html) + # markdown = 
markdown.replace(' ```', '```') - fit_markdown = "Set flag 'fit_markdown' to True to get cleaned HTML content." - fit_html = "Set flag 'fit_markdown' to True to get cleaned HTML content." + # fit_markdown = "Set flag 'fit_markdown' to True to get cleaned HTML content." + # fit_html = "Set flag 'fit_markdown' to True to get cleaned HTML content." - if kwargs.get('content_filter', None) or kwargs.get('fit_markdown', False): - content_filter = kwargs.get('content_filter', None) - if not content_filter: - content_filter = BM25ContentFilter( - user_query=kwargs.get('fit_markdown_user_query', None), - bm25_threshold=kwargs.get('fit_markdown_bm25_threshold', 1.0) - ) - fit_html = content_filter.filter_content(html) - fit_html = '\n'.join('
{}
'.format(s) for s in fit_html) - fit_markdown = h.handle(fit_html) + # if kwargs.get('content_filter', None) or kwargs.get('fit_markdown', False): + # content_filter = kwargs.get('content_filter', None) + # if not content_filter: + # content_filter = BM25ContentFilter( + # user_query=kwargs.get('fit_markdown_user_query', None), + # bm25_threshold=kwargs.get('fit_markdown_bm25_threshold', 1.0) + # ) + # fit_html = content_filter.filter_content(html) + # fit_html = '\n'.join('
{}
'.format(s) for s in fit_html) + # fit_markdown = h.handle(fit_html) - markdown_v2 = MarkdownGenerationResult( - raw_markdown=markdown, - markdown_with_citations=markdown, - references_markdown=markdown, - fit_markdown=fit_markdown - ) + # markdown_v2 = MarkdownGenerationResult( + # raw_markdown=markdown, + # markdown_with_citations=markdown, + # references_markdown=markdown, + # fit_markdown=fit_markdown + # ) - return { - 'markdown': markdown, - 'fit_markdown': fit_markdown, - 'fit_html': fit_html, - 'markdown_v2' : markdown_v2 - } + # return { + # 'markdown': markdown, + # 'fit_markdown': fit_markdown, + # 'fit_html': fit_html, + # 'markdown_v2' : markdown_v2 + # } + """ def flatten_nested_elements(self, node): if isinstance(node, NavigableString): @@ -483,7 +479,7 @@ class WebScrapingStrategy(ContentScrapingStrategy): ) return False - def _get_content_of_website_optimized(self, url: str, html: str, word_count_threshold: int = MIN_WORD_THRESHOLD, css_selector: str = None, **kwargs) -> Dict[str, Any]: + def _scrap(self, url: str, html: str, word_count_threshold: int = MIN_WORD_THRESHOLD, css_selector: str = None, **kwargs) -> Dict[str, Any]: success = True if not html: return None diff --git a/crawl4ai/html2text/__init__.py b/crawl4ai/html2text/__init__.py index c1effe6b..c41258e0 100644 --- a/crawl4ai/html2text/__init__.py +++ b/crawl4ai/html2text/__init__.py @@ -1006,10 +1006,136 @@ class HTML2Text(html.parser.HTMLParser): newlines += 1 return result - def html2text(html: str, baseurl: str = "", bodywidth: Optional[int] = None) -> str: if bodywidth is None: bodywidth = config.BODY_WIDTH h = HTML2Text(baseurl=baseurl, bodywidth=bodywidth) return h.handle(html) + +class CustomHTML2Text(HTML2Text): + def __init__(self, *args, handle_code_in_pre=False, **kwargs): + super().__init__(*args, **kwargs) + self.inside_pre = False + self.inside_code = False + self.preserve_tags = set() # Set of tags to preserve + self.current_preserved_tag = None + self.preserved_content 
= [] + self.preserve_depth = 0 + self.handle_code_in_pre = handle_code_in_pre + + # Configuration options + self.skip_internal_links = False + self.single_line_break = False + self.mark_code = False + self.include_sup_sub = False + self.body_width = 0 + self.ignore_mailto_links = True + self.ignore_links = False + self.escape_backslash = False + self.escape_dot = False + self.escape_plus = False + self.escape_dash = False + self.escape_snob = False + + def update_params(self, **kwargs): + """Update parameters and set preserved tags.""" + for key, value in kwargs.items(): + if key == 'preserve_tags': + self.preserve_tags = set(value) + elif key == 'handle_code_in_pre': + self.handle_code_in_pre = value + else: + setattr(self, key, value) + + def handle_tag(self, tag, attrs, start): + # Handle preserved tags + if tag in self.preserve_tags: + if start: + if self.preserve_depth == 0: + self.current_preserved_tag = tag + self.preserved_content = [] + # Format opening tag with attributes + attr_str = ''.join(f' {k}="{v}"' for k, v in attrs.items() if v is not None) + self.preserved_content.append(f'<{tag}{attr_str}>') + self.preserve_depth += 1 + return + else: + self.preserve_depth -= 1 + if self.preserve_depth == 0: + self.preserved_content.append(f'') + # Output the preserved HTML block with proper spacing + preserved_html = ''.join(self.preserved_content) + self.o('\n' + preserved_html + '\n') + self.current_preserved_tag = None + return + + # If we're inside a preserved tag, collect all content + if self.preserve_depth > 0: + if start: + # Format nested tags with attributes + attr_str = ''.join(f' {k}="{v}"' for k, v in attrs.items() if v is not None) + self.preserved_content.append(f'<{tag}{attr_str}>') + else: + self.preserved_content.append(f'') + return + + # Handle pre tags + if tag == 'pre': + if start: + self.o('```\n') # Markdown code block start + self.inside_pre = True + else: + self.o('\n```\n') # Markdown code block end + self.inside_pre = False + elif 
tag == 'code': + if self.inside_pre and not self.handle_code_in_pre: + # Ignore code tags inside pre blocks if handle_code_in_pre is False + return + if start: + self.o('`') # Markdown inline code start + self.inside_code = True + else: + self.o('`') # Markdown inline code end + self.inside_code = False + else: + super().handle_tag(tag, attrs, start) + + def handle_data(self, data, entity_char=False): + """Override handle_data to capture content within preserved tags.""" + if self.preserve_depth > 0: + self.preserved_content.append(data) + return + + if self.inside_pre: + # Output the raw content for pre blocks, including content inside code tags + self.o(data) # Directly output the data as-is (preserve newlines) + return + if self.inside_code: + # Inline code: no newlines allowed + self.o(data.replace('\n', ' ')) + return + + # Default behavior for other tags + super().handle_data(data, entity_char) + + + # # Handle pre tags + # if tag == 'pre': + # if start: + # self.o('```\n') + # self.inside_pre = True + # else: + # self.o('\n```') + # self.inside_pre = False + # # elif tag in ["h1", "h2", "h3", "h4", "h5", "h6"]: + # # pass + # else: + # super().handle_tag(tag, attrs, start) + + # def handle_data(self, data, entity_char=False): + # """Override handle_data to capture content within preserved tags.""" + # if self.preserve_depth > 0: + # self.preserved_content.append(data) + # return + # super().handle_data(data, entity_char) diff --git a/crawl4ai/js_snippet/__init__.py b/crawl4ai/js_snippet/__init__.py new file mode 100644 index 00000000..73b0c2dd --- /dev/null +++ b/crawl4ai/js_snippet/__init__.py @@ -0,0 +1,15 @@ +import os, sys + +# Create a function get name of a js script, then load from the CURRENT folder of this script and return its content as string, make sure its error free +def load_js_script(script_name): + # Get the path of the current script + current_script_path = os.path.dirname(os.path.realpath(__file__)) + # Get the path of the script to load + 
script_path = os.path.join(current_script_path, script_name + '.js') + # Check if the script exists + if not os.path.exists(script_path): + raise ValueError(f"Script {script_name} not found in the folder {current_script_path}") + # Load the content of the script + with open(script_path, 'r') as f: + script_content = f.read() + return script_content diff --git a/crawl4ai/js_snippet/navigator_overrider.js b/crawl4ai/js_snippet/navigator_overrider.js new file mode 100644 index 00000000..f341ceeb --- /dev/null +++ b/crawl4ai/js_snippet/navigator_overrider.js @@ -0,0 +1,25 @@ +// Pass the Permissions Test. +const originalQuery = window.navigator.permissions.query; +window.navigator.permissions.query = (parameters) => + parameters.name === "notifications" + ? Promise.resolve({ state: Notification.permission }) + : originalQuery(parameters); +Object.defineProperty(navigator, "webdriver", { + get: () => undefined, +}); +window.navigator.chrome = { + runtime: {}, + // Add other properties if necessary +}; +Object.defineProperty(navigator, "plugins", { + get: () => [1, 2, 3, 4, 5], +}); +Object.defineProperty(navigator, "languages", { + get: () => ["en-US", "en"], +}); +Object.defineProperty(document, "hidden", { + get: () => false, +}); +Object.defineProperty(document, "visibilityState", { + get: () => "visible", +}); diff --git a/crawl4ai/js_snippet/remove_overlay_elements.js b/crawl4ai/js_snippet/remove_overlay_elements.js new file mode 100644 index 00000000..0400d89c --- /dev/null +++ b/crawl4ai/js_snippet/remove_overlay_elements.js @@ -0,0 +1,119 @@ +async () => { + // Function to check if element is visible + const isVisible = (elem) => { + const style = window.getComputedStyle(elem); + return style.display !== "none" && style.visibility !== "hidden" && style.opacity !== "0"; + }; + + // Common selectors for popups and overlays + const commonSelectors = [ + // Close buttons first + 'button[class*="close" i]', + 'button[class*="dismiss" i]', + 
'button[aria-label*="close" i]', + 'button[title*="close" i]', + 'a[class*="close" i]', + 'span[class*="close" i]', + + // Cookie notices + '[class*="cookie-banner" i]', + '[id*="cookie-banner" i]', + '[class*="cookie-consent" i]', + '[id*="cookie-consent" i]', + + // Newsletter/subscription dialogs + '[class*="newsletter" i]', + '[class*="subscribe" i]', + + // Generic popups/modals + '[class*="popup" i]', + '[class*="modal" i]', + '[class*="overlay" i]', + '[class*="dialog" i]', + '[role="dialog"]', + '[role="alertdialog"]', + ]; + + // Try to click close buttons first + for (const selector of commonSelectors.slice(0, 6)) { + const closeButtons = document.querySelectorAll(selector); + for (const button of closeButtons) { + if (isVisible(button)) { + try { + button.click(); + await new Promise((resolve) => setTimeout(resolve, 100)); + } catch (e) { + console.log("Error clicking button:", e); + } + } + } + } + + // Remove remaining overlay elements + const removeOverlays = () => { + // Find elements with high z-index + const allElements = document.querySelectorAll("*"); + for (const elem of allElements) { + const style = window.getComputedStyle(elem); + const zIndex = parseInt(style.zIndex); + const position = style.position; + + if ( + isVisible(elem) && + (zIndex > 999 || position === "fixed" || position === "absolute") && + (elem.offsetWidth > window.innerWidth * 0.5 || + elem.offsetHeight > window.innerHeight * 0.5 || + style.backgroundColor.includes("rgba") || + parseFloat(style.opacity) < 1) + ) { + elem.remove(); + } + } + + // Remove elements matching common selectors + for (const selector of commonSelectors) { + const elements = document.querySelectorAll(selector); + elements.forEach((elem) => { + if (isVisible(elem)) { + elem.remove(); + } + }); + } + }; + + // Remove overlay elements + removeOverlays(); + + // Remove any fixed/sticky position elements at the top/bottom + const removeFixedElements = () => { + const elements = 
document.querySelectorAll("*"); + elements.forEach((elem) => { + const style = window.getComputedStyle(elem); + if ((style.position === "fixed" || style.position === "sticky") && isVisible(elem)) { + elem.remove(); + } + }); + }; + + removeFixedElements(); + + // Remove empty block elements as: div, p, span, etc. + const removeEmptyBlockElements = () => { + const blockElements = document.querySelectorAll( + "div, p, span, section, article, header, footer, aside, nav, main, ul, ol, li, dl, dt, dd, h1, h2, h3, h4, h5, h6" + ); + blockElements.forEach((elem) => { + if (elem.innerText.trim() === "") { + elem.remove(); + } + }); + }; + + // Remove margin-right and padding-right from body (often added by modal scripts) + document.body.style.marginRight = "0px"; + document.body.style.paddingRight = "0px"; + document.body.style.overflow = "auto"; + + // Wait a bit for any animations to complete + await new Promise((resolve) => setTimeout(resolve, 100)); +}; diff --git a/crawl4ai/js_snippet/update_image_dimensions.js b/crawl4ai/js_snippet/update_image_dimensions.js new file mode 100644 index 00000000..709a35d5 --- /dev/null +++ b/crawl4ai/js_snippet/update_image_dimensions.js @@ -0,0 +1,54 @@ +() => { + return new Promise((resolve) => { + const filterImage = (img) => { + // Filter out images that are too small + if (img.width < 100 && img.height < 100) return false; + + // Filter out images that are not visible + const rect = img.getBoundingClientRect(); + if (rect.width === 0 || rect.height === 0) return false; + + // Filter out images with certain class names (e.g., icons, thumbnails) + if (img.classList.contains("icon") || img.classList.contains("thumbnail")) return false; + + // Filter out images with certain patterns in their src (e.g., placeholder images) + if (img.src.includes("placeholder") || img.src.includes("icon")) return false; + + return true; + }; + + const images = Array.from(document.querySelectorAll("img")).filter(filterImage); + let imagesLeft = 
images.length; + + if (imagesLeft === 0) { + resolve(); + return; + } + + const checkImage = (img) => { + if (img.complete && img.naturalWidth !== 0) { + img.setAttribute("width", img.naturalWidth); + img.setAttribute("height", img.naturalHeight); + imagesLeft--; + if (imagesLeft === 0) resolve(); + } + }; + + images.forEach((img) => { + checkImage(img); + if (!img.complete) { + img.onload = () => { + checkImage(img); + }; + img.onerror = () => { + imagesLeft--; + if (imagesLeft === 0) resolve(); + }; + } + }); + + // Fallback timeout of 5 seconds + // setTimeout(() => resolve(), 5000); + resolve(); + }); +}; diff --git a/crawl4ai/markdown_generation_strategy.py b/crawl4ai/markdown_generation_strategy.py index 1e0ca664..b9e4b0c6 100644 --- a/crawl4ai/markdown_generation_strategy.py +++ b/crawl4ai/markdown_generation_strategy.py @@ -1,7 +1,7 @@ from abc import ABC, abstractmethod from typing import Optional, Dict, Any, Tuple from .models import MarkdownGenerationResult -from .utils import CustomHTML2Text +from .html2text import CustomHTML2Text from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter import re from urllib.parse import urljoin @@ -9,6 +9,17 @@ from urllib.parse import urljoin # Pre-compile the regex pattern LINK_PATTERN = re.compile(r'!?\[([^\]]+)\]\(([^)]+?)(?:\s+"([^"]*)")?\)') +def fast_urljoin(base: str, url: str) -> str: + """Fast URL joining for common cases.""" + if url.startswith(('http://', 'https://', 'mailto:', '//')): + return url + if url.startswith('/'): + # Handle absolute paths + if base.endswith('/'): + return base[:-1] + url + return base + url + return urljoin(base, url) + class MarkdownGenerationStrategy(ABC): """Abstract base class for markdown generation strategies.""" def __init__(self, content_filter: Optional[RelevantContentFilter] = None, options: Optional[Dict[str, Any]] = None): @@ -118,13 +129,3 @@ class DefaultMarkdownGenerator(MarkdownGenerationStrategy): fit_html=filtered_html, ) -def 
fast_urljoin(base: str, url: str) -> str: - """Fast URL joining for common cases.""" - if url.startswith(('http://', 'https://', 'mailto:', '//')): - return url - if url.startswith('/'): - # Handle absolute paths - if base.endswith('/'): - return base[:-1] + url - return base + url - return urljoin(base, url) \ No newline at end of file diff --git a/crawl4ai/tools.py b/crawl4ai/tools.py deleted file mode 100644 index ff36b53a..00000000 --- a/crawl4ai/tools.py +++ /dev/null @@ -1,34 +0,0 @@ -import time -import cProfile -import pstats -from functools import wraps - -def profile_and_time(func): - @wraps(func) - def wrapper(self, *args, **kwargs): - # Start timer - start_time = time.perf_counter() - - # Setup profiler - profiler = cProfile.Profile() - profiler.enable() - - # Run function - result = func(self, *args, **kwargs) - - # Stop profiler - profiler.disable() - - # Calculate elapsed time - elapsed_time = time.perf_counter() - start_time - - # Print timing - print(f"[PROFILER] Scraping completed in {elapsed_time:.2f} seconds") - - # Print profiling stats - stats = pstats.Stats(profiler) - stats.sort_stats('cumulative') # Sort by cumulative time - stats.print_stats(20) # Print top 20 time-consuming functions - - return result - return wrapper \ No newline at end of file diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index 879ba562..05a4fbb4 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -19,139 +19,13 @@ from typing import Optional, Tuple, Dict, Any import xxhash from colorama import Fore, Style, init import textwrap - -from .html2text import HTML2Text -class CustomHTML2Text(HTML2Text): - def __init__(self, *args, handle_code_in_pre=False, **kwargs): - super().__init__(*args, **kwargs) - self.inside_pre = False - self.inside_code = False - self.preserve_tags = set() # Set of tags to preserve - self.current_preserved_tag = None - self.preserved_content = [] - self.preserve_depth = 0 - self.handle_code_in_pre = handle_code_in_pre - - # Configuration 
options - self.skip_internal_links = False - self.single_line_break = False - self.mark_code = False - self.include_sup_sub = False - self.body_width = 0 - self.ignore_mailto_links = True - self.ignore_links = False - self.escape_backslash = False - self.escape_dot = False - self.escape_plus = False - self.escape_dash = False - self.escape_snob = False - - def update_params(self, **kwargs): - """Update parameters and set preserved tags.""" - for key, value in kwargs.items(): - if key == 'preserve_tags': - self.preserve_tags = set(value) - elif key == 'handle_code_in_pre': - self.handle_code_in_pre = value - else: - setattr(self, key, value) - - def handle_tag(self, tag, attrs, start): - # Handle preserved tags - if tag in self.preserve_tags: - if start: - if self.preserve_depth == 0: - self.current_preserved_tag = tag - self.preserved_content = [] - # Format opening tag with attributes - attr_str = ''.join(f' {k}="{v}"' for k, v in attrs.items() if v is not None) - self.preserved_content.append(f'<{tag}{attr_str}>') - self.preserve_depth += 1 - return - else: - self.preserve_depth -= 1 - if self.preserve_depth == 0: - self.preserved_content.append(f'') - # Output the preserved HTML block with proper spacing - preserved_html = ''.join(self.preserved_content) - self.o('\n' + preserved_html + '\n') - self.current_preserved_tag = None - return - - # If we're inside a preserved tag, collect all content - if self.preserve_depth > 0: - if start: - # Format nested tags with attributes - attr_str = ''.join(f' {k}="{v}"' for k, v in attrs.items() if v is not None) - self.preserved_content.append(f'<{tag}{attr_str}>') - else: - self.preserved_content.append(f'') - return - - # Handle pre tags - if tag == 'pre': - if start: - self.o('```\n') # Markdown code block start - self.inside_pre = True - else: - self.o('\n```\n') # Markdown code block end - self.inside_pre = False - elif tag == 'code': - if self.inside_pre and not self.handle_code_in_pre: - # Ignore code tags inside 
pre blocks if handle_code_in_pre is False - return - if start: - self.o('`') # Markdown inline code start - self.inside_code = True - else: - self.o('`') # Markdown inline code end - self.inside_code = False - else: - super().handle_tag(tag, attrs, start) - - def handle_data(self, data, entity_char=False): - """Override handle_data to capture content within preserved tags.""" - if self.preserve_depth > 0: - self.preserved_content.append(data) - return - - if self.inside_pre: - # Output the raw content for pre blocks, including content inside code tags - self.o(data) # Directly output the data as-is (preserve newlines) - return - if self.inside_code: - # Inline code: no newlines allowed - self.o(data.replace('\n', ' ')) - return - - # Default behavior for other tags - super().handle_data(data, entity_char) - - - # # Handle pre tags - # if tag == 'pre': - # if start: - # self.o('```\n') - # self.inside_pre = True - # else: - # self.o('\n```') - # self.inside_pre = False - # # elif tag in ["h1", "h2", "h3", "h4", "h5", "h6"]: - # # pass - # else: - # super().handle_tag(tag, attrs, start) - - # def handle_data(self, data, entity_char=False): - # """Override handle_data to capture content within preserved tags.""" - # if self.preserve_depth > 0: - # self.preserved_content.append(data) - # return - # super().handle_data(data, entity_char) +import cProfile +import pstats +from functools import wraps class InvalidCSSSelectorError(Exception): pass - def create_box_message( message: str, type: str = "info", @@ -374,50 +248,6 @@ def escape_json_string(s): return s -class CustomHTML2Text_v0(HTML2Text): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.inside_pre = False - self.inside_code = False - - self.skip_internal_links = False - self.single_line_break = False - self.mark_code = False - self.include_sup_sub = False - self.body_width = 0 - self.ignore_mailto_links = True - self.ignore_links = False - self.escape_backslash = False - 
self.escape_dot = False - self.escape_plus = False - self.escape_dash = False - self.escape_snob = False - - - def handle_tag(self, tag, attrs, start): - if tag == 'pre': - if start: - self.o('```\n') - self.inside_pre = True - else: - self.o('\n```') - self.inside_pre = False - elif tag in ["h1", "h2", "h3", "h4", "h5", "h6"]: - pass - - - # elif tag == 'code' and not self.inside_pre: - # if start: - # if not self.inside_pre: - # self.o('`') - # self.inside_code = True - # else: - # if not self.inside_pre: - # self.o('`') - # self.inside_code = False - - super().handle_tag(tag, attrs, start) - def replace_inline_tags(soup, tags, only_text=False): tag_replacements = { 'b': lambda tag: f"**{tag.text}**", @@ -979,7 +809,6 @@ def extract_metadata(html, soup=None): return metadata - def extract_xml_tags(string): tags = re.findall(r'<(\w+)>', string) return list(set(tags)) @@ -997,7 +826,6 @@ def extract_xml_data(tags, string): return data -# Function to perform the completion with exponential backoff def perform_completion_with_backoff( provider, prompt_with_variables, @@ -1351,6 +1179,35 @@ def clean_tokens(tokens: list[str]) -> list[str]: and not token.startswith('▲') and not token.startswith('⬆')] +def profile_and_time(func): + @wraps(func) + def wrapper(self, *args, **kwargs): + # Start timer + start_time = time.perf_counter() + + # Setup profiler + profiler = cProfile.Profile() + profiler.enable() + + # Run function + result = func(self, *args, **kwargs) + + # Stop profiler + profiler.disable() + + # Calculate elapsed time + elapsed_time = time.perf_counter() - start_time + + # Print timing + print(f"[PROFILER] Scraping completed in {elapsed_time:.2f} seconds") + + # Print profiling stats + stats = pstats.Stats(profiler) + stats.sort_stats('cumulative') # Sort by cumulative time + stats.print_stats(20) # Print top 20 time-consuming functions + + return result + return wrapper def generate_content_hash(content: str) -> str: """Generate a unique hash for content""" 
diff --git a/docs/examples/storage_state_tutorial.md b/docs/examples/storage_state_tutorial.md new file mode 100644 index 00000000..304e6399 --- /dev/null +++ b/docs/examples/storage_state_tutorial.md @@ -0,0 +1,225 @@ +### Using `storage_state` to Pre-Load Cookies and LocalStorage + +Crawl4ai’s `AsyncWebCrawler` lets you preserve and reuse session data, including cookies and localStorage, across multiple runs. By providing a `storage_state`, you can start your crawls already “logged in” or with any other necessary session data—no need to repeat the login flow every time. + +#### What is `storage_state`? + +`storage_state` can be: + +- A dictionary containing cookies and localStorage data. +- A path to a JSON file that holds this information. + +When you pass `storage_state` to the crawler, it applies these cookies and localStorage entries before loading any pages. This means your crawler effectively starts in a known authenticated or pre-configured state. + +#### Example Structure + +Here’s an example storage state: + +```json +{ + "cookies": [ + { + "name": "session", + "value": "abcd1234", + "domain": "example.com", + "path": "/", + "expires": 1675363572.037711, + "httpOnly": false, + "secure": false, + "sameSite": "None" + } + ], + "origins": [ + { + "origin": "https://example.com", + "localStorage": [ + { "name": "token", "value": "my_auth_token" }, + { "name": "refreshToken", "value": "my_refresh_token" } + ] + } + ] +} +``` + +This JSON sets a `session` cookie and two localStorage entries (`token` and `refreshToken`) for `https://example.com`. 
+ +--- + +### Passing `storage_state` as a Dictionary + +You can directly provide the data as a dictionary: + +```python +import asyncio +from crawl4ai import AsyncWebCrawler + +async def main(): + storage_dict = { + "cookies": [ + { + "name": "session", + "value": "abcd1234", + "domain": "example.com", + "path": "/", + "expires": 1675363572.037711, + "httpOnly": False, + "secure": False, + "sameSite": "None" + } + ], + "origins": [ + { + "origin": "https://example.com", + "localStorage": [ + {"name": "token", "value": "my_auth_token"}, + {"name": "refreshToken", "value": "my_refresh_token"} + ] + } + ] + } + + async with AsyncWebCrawler( + headless=True, + storage_state=storage_dict + ) as crawler: + result = await crawler.arun(url='https://example.com/protected') + if result.success: + print("Crawl succeeded with pre-loaded session data!") + print("Page HTML length:", len(result.html)) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +--- + +### Passing `storage_state` as a File + +If you prefer a file-based approach, save the JSON above to `mystate.json` and reference it: + +```python +import asyncio +from crawl4ai import AsyncWebCrawler + +async def main(): + async with AsyncWebCrawler( + headless=True, + storage_state="mystate.json" # Uses a JSON file instead of a dictionary + ) as crawler: + result = await crawler.arun(url='https://example.com/protected') + if result.success: + print("Crawl succeeded with pre-loaded session data!") + print("Page HTML length:", len(result.html)) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +--- + +### Using `storage_state` to Avoid Repeated Logins (Sign In Once, Use Later) + +A common scenario is when you need to log in to a site (entering username/password, etc.) to access protected pages. Doing so every crawl is cumbersome. Instead, you can: + +1. Perform the login once in a hook. +2. After login completes, export the resulting `storage_state` to a file. +3. 
On subsequent runs, provide that `storage_state` to skip the login step. + +**Step-by-Step Example:** + +**First Run (Perform Login and Save State):** + +```python +import asyncio +from crawl4ai import AsyncWebCrawler, CacheMode +from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator + +async def on_browser_created_hook(browser): + # Access the default context and create a page + context = browser.contexts[0] + page = await context.new_page() + + # Navigate to the login page + await page.goto("https://example.com/login", wait_until="domcontentloaded") + + # Fill in credentials and submit + await page.fill("input[name='username']", "myuser") + await page.fill("input[name='password']", "mypassword") + await page.click("button[type='submit']") + await page.wait_for_load_state("networkidle") + + # Now the site sets tokens in localStorage and cookies + # Export this state to a file so we can reuse it + await context.storage_state(path="my_storage_state.json") + await page.close() + +async def main(): + # First run: perform login and export the storage_state + async with AsyncWebCrawler( + headless=True, + verbose=True, + hooks={"on_browser_created": on_browser_created_hook}, + use_persistent_context=True, + user_data_dir="./my_user_data" + ) as crawler: + + # After on_browser_created_hook runs, we have storage_state saved to my_storage_state.json + result = await crawler.arun( + url='https://example.com/protected-page', + cache_mode=CacheMode.BYPASS, + markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True}), + ) + print("First run result success:", result.success) + if result.success: + print("Protected page HTML length:", len(result.html)) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +**Second Run (Reuse Saved State, No Login Needed):** + +```python +import asyncio +from crawl4ai import AsyncWebCrawler, CacheMode +from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator + +async def main(): + # 
Second run: no need to hook on_browser_created this time. + # Just provide the previously saved storage state. + async with AsyncWebCrawler( + headless=True, + verbose=True, + use_persistent_context=True, + user_data_dir="./my_user_data", + storage_state="my_storage_state.json" # Reuse previously exported state + ) as crawler: + + # Now the crawler starts already logged in + result = await crawler.arun( + url='https://example.com/protected-page', + cache_mode=CacheMode.BYPASS, + markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True}), + ) + print("Second run result success:", result.success) + if result.success: + print("Protected page HTML length:", len(result.html)) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +**What’s Happening Here?** + +- During the first run, the `on_browser_created_hook` logs into the site. +- After logging in, the crawler exports the current session (cookies, localStorage, etc.) to `my_storage_state.json`. +- On subsequent runs, passing `storage_state="my_storage_state.json"` starts the browser context with these tokens already in place, skipping the login steps. + +**Sign Out Scenario:** +If the website allows you to sign out by clearing tokens or by navigating to a sign-out URL, you can also run a script that uses `on_browser_created_hook` or `arun` to simulate signing out, then export the resulting `storage_state` again. That would give you a baseline “logged out” state to start fresh from next time. + +--- + +### Conclusion + +By using `storage_state`, you can skip repetitive actions, like logging in, and jump straight into crawling protected content. Whether you provide a file path or a dictionary, this powerful feature helps maintain state between crawls, simplifying your data extraction pipelines. 
\ No newline at end of file diff --git a/docs/md_v2/basic/quickstart.md b/docs/md_v2/basic/quickstart.md index 95b8a397..c18cd7d1 100644 --- a/docs/md_v2/basic/quickstart.md +++ b/docs/md_v2/basic/quickstart.md @@ -8,7 +8,7 @@ First, let's import the necessary modules and create an instance of `AsyncWebCra ```python import asyncio -from crawl4ai import AsyncWebCrawler, CasheMode +from crawl4ai import AsyncWebCrawler, CacheMode async def main(): async with AsyncWebCrawler(verbose=True) as crawler: diff --git a/tests/async/test_0.4.2_browser_manager.py b/tests/async/test_0.4.2_browser_manager.py new file mode 100644 index 00000000..9bb19582 --- /dev/null +++ b/tests/async/test_0.4.2_browser_manager.py @@ -0,0 +1,153 @@ +import os, sys +parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +sys.path.append(parent_dir) +__location__ = os.path.realpath( os.path.join(os.getcwd(), os.path.dirname(__file__))) + +import os, sys +import asyncio +from crawl4ai import AsyncWebCrawler, CacheMode +from crawl4ai.content_filter_strategy import PruningContentFilter +from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator + +# Assuming that the changes made allow different configurations +# for managed browser, persistent context, and so forth. 
+ +async def test_default_headless(): + async with AsyncWebCrawler( + headless=True, + verbose=True, + user_agent_mode="random", + user_agent_generator_config={"device_type": "mobile", "os_type": "android"}, + use_managed_browser=False, + use_persistent_context=False, + ignore_https_errors=True, + # Testing normal ephemeral context + ) as crawler: + result = await crawler.arun( + url='https://www.kidocode.com/degrees/technology', + cache_mode=CacheMode.BYPASS, + markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True}), + ) + print("[test_default_headless] success:", result.success) + print("HTML length:", len(result.html if result.html else "")) + +async def test_managed_browser_persistent(): + # Treating use_persistent_context=True as managed_browser scenario. + async with AsyncWebCrawler( + headless=False, + verbose=True, + user_agent_mode="random", + user_agent_generator_config={"device_type": "desktop", "os_type": "mac"}, + use_managed_browser=True, + use_persistent_context=True, # now should behave same as managed browser + user_data_dir="./outpu/test_profile", + # This should store and reuse profile data across runs + ) as crawler: + result = await crawler.arun( + url='https://www.google.com', + cache_mode=CacheMode.BYPASS, + markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True}) + ) + print("[test_managed_browser_persistent] success:", result.success) + print("HTML length:", len(result.html if result.html else "")) + +async def test_session_reuse(): + # Test creating a session, using it for multiple calls + session_id = "my_session" + async with AsyncWebCrawler( + headless=False, + verbose=True, + user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36", + # Fixed user-agent for consistency + use_managed_browser=False, + use_persistent_context=False, + ) as crawler: + + # First call: create session + result1 = await crawler.arun( + url='https://www.example.com', + cache_mode=CacheMode.BYPASS, + 
session_id=session_id, + markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True}) + ) + print("[test_session_reuse first call] success:", result1.success) + + # Second call: same session, possibly cookie retained + result2 = await crawler.arun( + url='https://www.example.com/about', + cache_mode=CacheMode.BYPASS, + session_id=session_id, + markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True}) + ) + print("[test_session_reuse second call] success:", result2.success) + +async def test_magic_mode(): + # Test magic mode with override_navigator and simulate_user + async with AsyncWebCrawler( + headless=False, + verbose=True, + user_agent_mode="random", + user_agent_generator_config={"device_type": "desktop", "os_type": "windows"}, + use_managed_browser=False, + use_persistent_context=False, + magic=True, + override_navigator=True, + simulate_user=True, + ) as crawler: + result = await crawler.arun( + url='https://www.kidocode.com/degrees/business', + cache_mode=CacheMode.BYPASS, + markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True}) + ) + print("[test_magic_mode] success:", result.success) + print("HTML length:", len(result.html if result.html else "")) + +async def test_proxy_settings(): + # Test with a proxy (if available) to ensure code runs with proxy + async with AsyncWebCrawler( + headless=True, + verbose=False, + user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36", + proxy="http://127.0.0.1:8080", # Assuming local proxy server for test + use_managed_browser=False, + use_persistent_context=False, + ) as crawler: + result = await crawler.arun( + url='https://httpbin.org/ip', + cache_mode=CacheMode.BYPASS, + markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True}) + ) + print("[test_proxy_settings] success:", result.success) + if result.success: + print("HTML preview:", result.html[:200] if result.html else "") + +async def test_ignore_https_errors(): + # Test ignore HTTPS 
errors with a self-signed or invalid cert domain + # This is just conceptual, the domain should be one that triggers SSL error. + # Using a hypothetical URL that fails SSL: + async with AsyncWebCrawler( + headless=True, + verbose=True, + user_agent="Mozilla/5.0", + ignore_https_errors=True, + use_managed_browser=False, + use_persistent_context=False, + ) as crawler: + result = await crawler.arun( + url='https://self-signed.badssl.com/', + cache_mode=CacheMode.BYPASS, + markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True}) + ) + print("[test_ignore_https_errors] success:", result.success) + +async def main(): + print("Running tests...") + # await test_default_headless() + # await test_managed_browser_persistent() + # await test_session_reuse() + # await test_magic_mode() + # await test_proxy_settings() + await test_ignore_https_errors() + +if __name__ == "__main__": + asyncio.run(main()) From 5431fa2d0ce78cf933786d2817690d0681583772 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Tue, 10 Dec 2024 20:10:39 +0800 Subject: [PATCH 60/70] Add PDF & screenshot functionality, new tutorial - Added support for exporting pages as PDFs - Enhanced screenshot functionality for long pages - Created a tutorial on dynamic content loading with 'Load More' buttons. - Updated web crawler to handle PDF data in responses. 
--- crawl4ai/async_crawler_strategy.py | 146 +++++++++++++++++++++-- crawl4ai/async_webcrawler.py | 12 ++ crawl4ai/config.py | 3 +- crawl4ai/models.py | 2 + docs/examples/tutorial_dynamic_clicks.md | 117 ++++++++++++++++++ 5 files changed, 271 insertions(+), 9 deletions(-) create mode 100644 docs/examples/tutorial_dynamic_clicks.md diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 1d88c3a8..553e9df4 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -19,8 +19,14 @@ from .js_snippet import load_js_script from .models import AsyncCrawlResponse from .utils import create_box_message from .user_agent_generator import UserAgentGenerator +from .config import SCREENSHOT_HEIGHT_TRESHOLD from playwright_stealth import StealthConfig, stealth_async + +from io import BytesIO +import base64 +from PIL import Image, ImageDraw, ImageFont + stealth_config = StealthConfig( webdriver=True, chrome_app=True, @@ -481,6 +487,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): self.user_agent = user_agenr_generator.generate( **kwargs.get("user_agent_generator_config", {}) ) + self.pdf = kwargs.get("pdf", False) # New flag + self.screenshot_requested = kwargs.get('screenshot', False) + self.proxy = kwargs.get("proxy") self.proxy_config = kwargs.get("proxy_config") self.headless = kwargs.get("headless", True) @@ -752,7 +761,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): """ response_headers = {} status_code = 200 # Default to 200 for local/raw HTML - screenshot_requested = kwargs.get('screenshot', False) + screenshot_requested = kwargs.get("screenshot", self.screenshot_requested) + pdf_requested = kwargs.get("pdf", self.pdf) screenshot_data = None if url.startswith(('http://', 'https://')): @@ -796,6 +806,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): response_headers = {} status_code = None + screenshot_requested = kwargs.get("screenshot", 
self.screenshot_requested) + pdf_requested = kwargs.get("pdf", self.pdf) + # Reset downloaded files list for new crawl self._downloaded_files = [] @@ -1069,17 +1082,28 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): html = await page.content() await self.execute_hook('before_return_html', page, html, context = context, **kwargs) + start_export_time = time.perf_counter() + pdf_data = None + if pdf_requested: + # Generate PDF once + pdf_data = await self.export_pdf(page) + # Check if kwargs has screenshot=True then take screenshot screenshot_data = None - if kwargs.get("screenshot"): + if screenshot_requested: #kwargs.get("screenshot"): # Check we have screenshot_wait_for parameter, if we have simply wait for that time screenshot_wait_for = kwargs.get("screenshot_wait_for") if screenshot_wait_for: await asyncio.sleep(screenshot_wait_for) - screenshot_data = await self.take_screenshot(page) - - # if self.verbose: - # print(f"[LOG] ✅ Crawled {url} successfully!") + + screenshot_data = await self.take_screenshot(page, **kwargs) + end_export_time = time.perf_counter() + if screenshot_data or pdf_data: + self.logger.info( + message="Exporting PDF and taking screenshot took {duration:.2f}s", + tag="EXPORT", + params={"duration": end_export_time - start_export_time} + ) if self.use_cached_html: cache_file_path = os.path.join( @@ -1105,6 +1129,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): response_headers=response_headers, status_code=status_code, screenshot=screenshot_data, + pdf_data=pdf_data, get_delayed_content=get_delayed_content, downloaded_files=self._downloaded_files if self._downloaded_files else None ) @@ -1181,7 +1206,112 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # if self.verbose: # print(f"Warning: Failed to remove overlay elements: {str(e)}") - async def take_screenshot(self, page: Page) -> str: + async def export_pdf(self, page: Page) -> bytes: + """ + Exports the current page as a PDF. 
+ """ + pdf_data = await page.pdf(print_background=True) + return pdf_data + + async def take_screenshot(self, page, **kwargs) -> str: + page_height = await page.evaluate("document.documentElement.scrollHeight") + if page_height < kwargs.get("screenshot_height_threshold", SCREENSHOT_HEIGHT_TRESHOLD): + # Page is short enough, just take a screenshot + return await self.take_screenshot_naive(page) + else: + # Page is too long, try to take a full-page screenshot + return await self.take_screenshot_scroller(page, **kwargs) + # return await self.take_screenshot_from_pdf(await self.export_pdf(page)) + + async def take_screenshot_from_pdf(self, pdf_data: bytes) -> str: + """ + Convert the first page of the PDF to a screenshot. + Requires pdf2image and poppler. + """ + try: + from pdf2image import convert_from_bytes + images = convert_from_bytes(pdf_data) + final_img = images[0].convert('RGB') + buffered = BytesIO() + final_img.save(buffered, format="JPEG") + return base64.b64encode(buffered.getvalue()).decode('utf-8') + except Exception as e: + error_message = f"Failed to take PDF-based screenshot: {str(e)}" + self.logger.error( + message="PDF Screenshot failed: {error}", + tag="ERROR", + params={"error": error_message} + ) + # Return error image as fallback + img = Image.new('RGB', (800, 600), color='black') + draw = ImageDraw.Draw(img) + font = ImageFont.load_default() + draw.text((10, 10), error_message, fill=(255, 255, 255), font=font) + buffered = BytesIO() + img.save(buffered, format="JPEG") + return base64.b64encode(buffered.getvalue()).decode('utf-8') + + async def take_screenshot_scroller(self, page: Page, **kwargs) -> str: + """ + Attempt to set a large viewport and take a full-page screenshot. + If still too large, segment the page as before. 
+ """ + try: + # Get page height + page_height = await page.evaluate("document.documentElement.scrollHeight") + page_width = await page.evaluate("document.documentElement.scrollWidth") + + # Set a large viewport + large_viewport_height = min(page_height, kwargs.get("screenshot_height_threshold", SCREENSHOT_HEIGHT_TRESHOLD)) + await page.set_viewport_size({"width": page_width, "height": large_viewport_height}) + + # Page still too long, segment approach + segments = [] + viewport_size = page.viewport_size + viewport_height = viewport_size["height"] + + num_segments = (page_height // viewport_height) + 1 + for i in range(num_segments): + y_offset = i * viewport_height + await page.evaluate(f"window.scrollTo(0, {y_offset})") + await asyncio.sleep(0.01) # wait for render + seg_shot = await page.screenshot(full_page=False) + img = Image.open(BytesIO(seg_shot)).convert('RGB') + segments.append(img) + + total_height = sum(img.height for img in segments) + stitched = Image.new('RGB', (segments[0].width, total_height)) + offset = 0 + for img in segments: + # stitched.paste(img, (0, offset)) + stitched.paste(img.convert('RGB'), (0, offset)) + offset += img.height + + buffered = BytesIO() + stitched = stitched.convert('RGB') + stitched.save(buffered, format="BMP", quality=85) + encoded = base64.b64encode(buffered.getvalue()).decode('utf-8') + + return encoded + except Exception as e: + error_message = f"Failed to take large viewport screenshot: {str(e)}" + self.logger.error( + message="Large viewport screenshot failed: {error}", + tag="ERROR", + params={"error": error_message} + ) + # return error image + img = Image.new('RGB', (800, 600), color='black') + draw = ImageDraw.Draw(img) + font = ImageFont.load_default() + draw.text((10, 10), error_message, fill=(255, 255, 255), font=font) + buffered = BytesIO() + img.save(buffered, format="JPEG") + return base64.b64encode(buffered.getvalue()).decode('utf-8') + finally: + await page.close() + + async def 
take_screenshot_naive(self, page: Page) -> str: """ Takes a screenshot of the current page. @@ -1193,7 +1323,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): """ try: # The page is already loaded, just take the screenshot - screenshot = await page.screenshot(full_page=True) + screenshot = await page.screenshot(full_page=False) return base64.b64encode(screenshot).decode('utf-8') except Exception as e: error_message = f"Failed to take screenshot: {str(e)}" diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 1a4b1333..fc6fe82f 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -147,6 +147,7 @@ class AsyncWebCrawler: # Other parameters css_selector: str = None, screenshot: bool = False, + pdf: bool = False, user_agent: str = None, verbose=True, **kwargs, @@ -230,6 +231,7 @@ class AsyncWebCrawler: async_response: AsyncCrawlResponse = None cached_result = None screenshot_data = None + pdf_data = None extracted_content = None start_time = time.perf_counter() @@ -245,6 +247,10 @@ class AsyncWebCrawler: screenshot_data = cached_result.screenshot if not screenshot_data: cached_result = None + if pdf: + pdf_data = cached_result.pdf + if not pdf_data: + cached_result = None # if verbose: # print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Cache hit for {cache_context.display_url} | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {time.perf_counter() - start_time:.2f}s") self.logger.url_status( @@ -264,10 +270,12 @@ class AsyncWebCrawler: async_response: AsyncCrawlResponse = await self.crawler_strategy.crawl( url, screenshot=screenshot, + pdf=pdf, **kwargs ) html = sanitize_input_encode(async_response.html) screenshot_data = async_response.screenshot + pdf_data = async_response.pdf_data t2 = time.perf_counter() self.logger.url_status( url=cache_context.display_url, @@ -289,6 +297,7 @@ class AsyncWebCrawler: content_filter=content_filter, 
css_selector=css_selector, screenshot=screenshot_data, + pdf_data=pdf_data, verbose=verbose, is_cached=bool(cached_result), async_response=async_response, @@ -362,6 +371,7 @@ class AsyncWebCrawler: bypass_cache: bool = False, css_selector: str = None, screenshot: bool = False, + pdf: bool = False, user_agent: str = None, verbose=True, **kwargs, @@ -550,6 +560,7 @@ class AsyncWebCrawler: ) screenshot = None if not screenshot else screenshot + pdf_data = kwargs.get("pdf_data", None) if kwargs.get("prettiify", False): @@ -567,6 +578,7 @@ class AsyncWebCrawler: links=links, metadata=metadata, screenshot=screenshot, + pdf=pdf_data, extracted_content=extracted_content, success=True, error_message="", diff --git a/crawl4ai/config.py b/crawl4ai/config.py index 786ca4e5..e17ff34f 100644 --- a/crawl4ai/config.py +++ b/crawl4ai/config.py @@ -56,4 +56,5 @@ MAX_METRICS_HISTORY = 1000 NEED_MIGRATION = True URL_LOG_SHORTEN_LENGTH = 30 -SHOW_DEPRECATION_WARNINGS = True \ No newline at end of file +SHOW_DEPRECATION_WARNINGS = True +SCREENSHOT_HEIGHT_TRESHOLD = 10000 \ No newline at end of file diff --git a/crawl4ai/models.py b/crawl4ai/models.py index 3a1b8bd1..315069fb 100644 --- a/crawl4ai/models.py +++ b/crawl4ai/models.py @@ -23,6 +23,7 @@ class CrawlResult(BaseModel): links: Dict[str, List[Dict]] = {} downloaded_files: Optional[List[str]] = None screenshot: Optional[str] = None + pdf : Optional[bytes] = None markdown: Optional[Union[str, MarkdownGenerationResult]] = None markdown_v2: Optional[MarkdownGenerationResult] = None fit_markdown: Optional[str] = None @@ -39,6 +40,7 @@ class AsyncCrawlResponse(BaseModel): response_headers: Dict[str, str] status_code: int screenshot: Optional[str] = None + pdf_data: Optional[bytes] = None get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None downloaded_files: Optional[List[str]] = None diff --git a/docs/examples/tutorial_dynamic_clicks.md b/docs/examples/tutorial_dynamic_clicks.md new file mode 100644 index 
00000000..d9669952 --- /dev/null +++ b/docs/examples/tutorial_dynamic_clicks.md @@ -0,0 +1,117 @@ +# Tutorial: Clicking Buttons to Load More Content with Crawl4AI + +## Introduction + +When scraping dynamic websites, it’s common to encounter “Load More” or “Next” buttons that must be clicked to reveal new content. Crawl4AI provides a straightforward way to handle these situations using JavaScript execution and waiting conditions. In this tutorial, we’ll cover two approaches: + +1. **Step-by-step (Session-based) Approach:** Multiple calls to `arun()` to progressively load more content. +2. **Single-call Approach:** Execute a more complex JavaScript snippet inside a single `arun()` call to handle all clicks at once before the extraction. + +## Prerequisites + +- A working installation of Crawl4AI +- Basic familiarity with Python’s `async`/`await` syntax + +## Step-by-Step Approach + +Use a session ID to maintain state across multiple `arun()` calls: + +```python +from crawl4ai import AsyncWebCrawler, CacheMode + +js_code = [ + # This JS finds the “Next” button and clicks it + "const nextButton = document.querySelector('button.next'); nextButton && nextButton.click();" +] + +wait_for_condition = "css:.new-content-class" + +async with AsyncWebCrawler(headless=True, verbose=True) as crawler: + # 1. Load the initial page + result_initial = await crawler.arun( + url="https://example.com", + cache_mode=CacheMode.BYPASS, + session_id="my_session" + ) + + # 2. Click the 'Next' button and wait for new content + result_next = await crawler.arun( + url="https://example.com", + session_id="my_session", + js_code=js_code, + wait_for=wait_for_condition, + js_only=True, + cache_mode=CacheMode.BYPASS + ) + +# `result_next` now contains the updated HTML after clicking 'Next' +``` + +**Key Points:** +- **`session_id`**: Keeps the same browser context open. +- **`js_code`**: Executes JavaScript in the context of the already loaded page. 
+- **`wait_for`**: Ensures the crawler waits until new content is fully loaded. +- **`js_only=True`**: Runs the JS in the current session without reloading the page. + +By repeating the `arun()` call multiple times and modifying the `js_code` (e.g., clicking different modules or pages), you can iteratively load all the desired content. + +## Single-call Approach + +If the page allows it, you can run a single `arun()` call with a more elaborate JavaScript snippet that: +- Iterates over all the modules or "Next" buttons +- Clicks them one by one +- Waits for content updates between each click +- Once done, returns control to Crawl4AI for extraction. + +Example snippet: + +```python +from crawl4ai import AsyncWebCrawler, CacheMode + +js_code = [ + # Example JS that clicks multiple modules: + """ + (async () => { + const modules = document.querySelectorAll('.module-item'); + for (let i = 0; i < modules.length; i++) { + modules[i].scrollIntoView(); + modules[i].click(); + // Wait for each module’s content to load, adjust 100ms as needed + await new Promise(r => setTimeout(r, 100)); + } + })(); + """ +] + +async with AsyncWebCrawler(headless=True, verbose=True) as crawler: + result = await crawler.arun( + url="https://example.com", + js_code=js_code, + wait_for="css:.final-loaded-content-class", + cache_mode=CacheMode.BYPASS + ) + +# `result` now contains all content after all modules have been clicked in one go. +``` + +**Key Points:** +- All interactions (clicks and waits) happen before the extraction. +- Ideal for pages where all steps can be done in a single pass. + +## Choosing the Right Approach + +- **Step-by-Step (Session-based)**: + - Good when you need fine-grained control or must dynamically check conditions before clicking the next page. + - Useful if the page requires multiple conditions checked at runtime. + +- **Single-call**: + - Perfect if the sequence of interactions is known in advance. 
+ - Cleaner code if the page’s structure is consistent and predictable. + +## Conclusion + +Crawl4AI makes it easy to handle dynamic content: +- Use session IDs and multiple `arun()` calls for stepwise crawling. +- Or pack all actions into one `arun()` call if the interactions are well-defined upfront. + +This flexibility ensures you can handle a wide range of dynamic web pages efficiently. From 759164831daf69106dc39c7b999601e3bb607132 Mon Sep 17 00:00:00 2001 From: lvzhengri <95766782+lvzhengri@users.noreply.github.com> Date: Tue, 10 Dec 2024 20:56:52 +0800 Subject: [PATCH 61/70] Update async_webcrawler.py (#337) add @asynccontextmanager --- crawl4ai/async_webcrawler.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 2c17602d..9fe4fcc4 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -132,6 +132,11 @@ class AsyncWebCrawler: # if self.verbose: # print(f"{Fore.GREEN}{self.tag_format('READY')} {self.log_icons['READY']} AsyncWebCrawler initialized{Style.RESET_ALL}") + @asynccontextmanager + async def nullcontext(self): + """异步空上下文管理器""" + yield + async def arun( self, url: str, From 5188b7a6a058f6cfe9686a2b98ebad10018f7a5d Mon Sep 17 00:00:00 2001 From: UncleCode Date: Tue, 10 Dec 2024 20:59:31 +0800 Subject: [PATCH 62/70] Add full-page screenshot and PDF export features - Introduced a new approach for capturing full-page screenshots by exporting them as PDFs first, enhancing reliability and performance. - Added documentation for the feature in `docs/examples/full_page_screenshot_and_pdf_export.md`. - Refactored `perform_completion_with_backoff` in `crawl4ai/utils.py` to include necessary extra parameters. - Updated `quickstart_async.py` to utilize LLM extraction with refined arguments. 
--- crawl4ai/utils.py | 10 ++-- .../full_page_screenshot_and_pdf_export.md | 58 +++++++++++++++++++ docs/examples/quickstart_async.py | 11 +++- 3 files changed, 73 insertions(+), 6 deletions(-) create mode 100644 docs/examples/full_page_screenshot_and_pdf_export.md diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index 05a4fbb4..8a12ff0c 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -839,7 +839,11 @@ def perform_completion_with_backoff( max_attempts = 3 base_delay = 2 # Base delay in seconds, you can adjust this based on your needs - extra_args = {} + extra_args = { + "temperature": 0.01, + 'api_key': api_token, + 'base_url': base_url + } if json_response: extra_args["response_format"] = { "type": "json_object" } @@ -848,14 +852,12 @@ def perform_completion_with_backoff( for attempt in range(max_attempts): try: + response =completion( model=provider, messages=[ {"role": "user", "content": prompt_with_variables} ], - temperature=0.01, - api_key=api_token, - base_url=base_url, **extra_args ) return response # Return the successful response diff --git a/docs/examples/full_page_screenshot_and_pdf_export.md b/docs/examples/full_page_screenshot_and_pdf_export.md new file mode 100644 index 00000000..1afc24ba --- /dev/null +++ b/docs/examples/full_page_screenshot_and_pdf_export.md @@ -0,0 +1,58 @@ +# Capturing Full-Page Screenshots and PDFs from Massive Webpages with Crawl4AI + +When dealing with very long web pages, traditional full-page screenshots can be slow or fail entirely. For large pages (like extensive Wikipedia articles), generating a single massive screenshot often leads to delays, memory issues, or style differences. + +**The New Approach:** +We’ve introduced a new feature that effortlessly handles even the biggest pages by first exporting them as a PDF, then converting that PDF into a high-quality image. This approach leverages the browser’s built-in PDF rendering, making it both stable and efficient for very long content. 
You also have the option to directly save the PDF for your own usage—no need for multiple passes or complex stitching logic. + +**Key Benefits:** +- **Reliability:** The PDF export never times out and works regardless of page length. +- **Versatility:** Get both the PDF and a screenshot in one crawl, without reloading or reprocessing. +- **Performance:** Skips manual scrolling and stitching images, reducing complexity and runtime. + +**Simple Example:** +```python +import os, sys +import asyncio +from crawl4ai import AsyncWebCrawler, CacheMode + +# Adjust paths as needed +parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +sys.path.append(parent_dir) +__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) + +async def main(): + async with AsyncWebCrawler() as crawler: + # Request both PDF and screenshot + result = await crawler.arun( + url='https://en.wikipedia.org/wiki/List_of_common_misconceptions', + cache_mode=CacheMode.BYPASS, + pdf=True, + screenshot=True + ) + + if result.success: + # Save screenshot + if result.screenshot: + from base64 import b64decode + with open(os.path.join(__location__, "screenshot.png"), "wb") as f: + f.write(b64decode(result.screenshot)) + + # Save PDF + if result.pdf: + pdf_bytes = result.pdf + with open(os.path.join(__location__, "page.pdf"), "wb") as f: + f.write(pdf_bytes) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +**What Happens Under the Hood:** +- Crawl4AI navigates to the target page. +- If `pdf=True`, it exports the current page as a full PDF, capturing all of its content no matter the length. +- If `screenshot=True`, and a PDF is already available, it directly converts the first page of that PDF to an image for you—no repeated loading or scrolling. +- Finally, you get your PDF and/or screenshot ready to use. + +**Conclusion:** +With this feature, Crawl4AI becomes even more robust and versatile for large-scale content extraction. 
Whether you need a PDF snapshot or a quick screenshot, you now have a reliable solution for even the most extensive webpages. \ No newline at end of file diff --git a/docs/examples/quickstart_async.py b/docs/examples/quickstart_async.py index ac844ed5..1c76bf18 100644 --- a/docs/examples/quickstart_async.py +++ b/docs/examples/quickstart_async.py @@ -117,7 +117,13 @@ async def extract_structured_data_using_llm(provider: str, api_token: str = None print(f"API token is required for {provider}. Skipping this example.") return - extra_args = {} + # extra_args = {} + extra_args={ + "temperature": 0, + "top_p": 0.9, + "max_tokens": 2000, + # any other supported parameters for litellm + } if extra_headers: extra_args["extra_headers"] = extra_headers @@ -598,6 +604,8 @@ async def fit_markdown_remove_overlay(): async def main(): + await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY")) + await simple_crawl() await simple_example_with_running_js_code() await simple_example_with_css_selector() @@ -609,7 +617,6 @@ async def main(): # await extract_structured_data_using_llm() # await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY")) # await extract_structured_data_using_llm("ollama/llama3.2") - await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY")) # You always can pass custom headers to the extraction strategy # custom_headers = { From 0982c639aee0ab6f8517cd9aecd7a99ef62fa051 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 12 Dec 2024 19:35:09 +0800 Subject: [PATCH 63/70] Enhance AsyncWebCrawler and related configurations - Introduced new configuration classes: BrowserConfig and CrawlerRunConfig. - Refactored AsyncWebCrawler to leverage the new configuration system for cleaner parameter management. - Updated AsyncPlaywrightCrawlerStrategy for better flexibility and reduced legacy parameters. 
- Improved error handling with detailed context extraction during exceptions. - Enhanced overall maintainability and usability of the web crawler. --- .gitignore | 1 + README.sync.md | 244 -- a.md | 4214 ++++++++++++++++++++ crawl4ai/__init__.py | 17 +- crawl4ai/async_configs.py | 402 ++ crawl4ai/async_crawler_strategy.current.py | 1475 ------- crawl4ai/async_crawler_strategy.py | 863 ++-- crawl4ai/async_database.py | 85 +- crawl4ai/async_webcrawler.py | 922 +++-- crawl4ai/config.py | 4 +- crawl4ai/utils.py | 63 +- docs/examples/quickstart_async.config.py | 517 +++ docs/md_v2/basic/cache-modes.md | 2 +- tests/async/test_0.4.2_config_params.py | 231 ++ 14 files changed, 6373 insertions(+), 2667 deletions(-) delete mode 100644 README.sync.md create mode 100644 a.md create mode 100644 crawl4ai/async_configs.py delete mode 100644 crawl4ai/async_crawler_strategy.current.py create mode 100644 docs/examples/quickstart_async.config.py create mode 100644 tests/async/test_0.4.2_config_params.py diff --git a/.gitignore b/.gitignore index 02c75b3f..432b5aa2 100644 --- a/.gitignore +++ b/.gitignore @@ -206,6 +206,7 @@ pypi_build.sh git_issues.py git_issues.md +.next/ .tests/ .issues/ .docs/ diff --git a/README.sync.md b/README.sync.md deleted file mode 100644 index 6bbef7e4..00000000 --- a/README.sync.md +++ /dev/null @@ -1,244 +0,0 @@ -# Crawl4AI v0.2.77 🕷️🤖 - -[![GitHub Stars](https://img.shields.io/github/stars/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/stargazers) -[![GitHub Forks](https://img.shields.io/github/forks/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/network/members) -[![GitHub Issues](https://img.shields.io/github/issues/unclecode/crawl4ai)](https://github.com/unclecode/crawl4ai/issues) -[![GitHub Pull Requests](https://img.shields.io/github/issues-pr/unclecode/crawl4ai)](https://github.com/unclecode/crawl4ai/pulls) 
-[![License](https://img.shields.io/github/license/unclecode/crawl4ai)](https://github.com/unclecode/crawl4ai/blob/main/LICENSE) - -Crawl4AI simplifies web crawling and data extraction, making it accessible for large language models (LLMs) and AI applications. 🆓🌐 - -#### [v0.2.77] - 2024-08-02 - -Major improvements in functionality, performance, and cross-platform compatibility! 🚀 - -- 🐳 **Docker enhancements**: - - Significantly improved Dockerfile for easy installation on Linux, Mac, and Windows. -- 🌐 **Official Docker Hub image**: - - Launched our first official image on Docker Hub for streamlined deployment (unclecode/crawl4ai). -- 🔧 **Selenium upgrade**: - - Removed dependency on ChromeDriver, now using Selenium's built-in capabilities for better compatibility. -- 🖼️ **Image description**: - - Implemented ability to generate textual descriptions for extracted images from web pages. -- ⚡ **Performance boost**: - - Various improvements to enhance overall speed and performance. - -## Try it Now! 
- -✨ Play around with this [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1sJPAmeLj5PMrg2VgOwMJ2ubGIcK0cJeX?usp=sharing) - -✨ visit our [Documentation Website](https://crawl4ai.com/mkdocs/) - -✨ Check [Demo](https://crawl4ai.com/mkdocs/demo) - -## Features ✨ - -- 🆓 Completely free and open-source -- 🤖 LLM-friendly output formats (JSON, cleaned HTML, markdown) -- 🌍 Supports crawling multiple URLs simultaneously -- 🎨 Extracts and returns all media tags (Images, Audio, and Video) -- 🔗 Extracts all external and internal links -- 📚 Extracts metadata from the page -- 🔄 Custom hooks for authentication, headers, and page modifications before crawling -- 🕵️ User-agent customization -- 🖼️ Takes screenshots of the page -- 📜 Executes multiple custom JavaScripts before crawling -- 📚 Various chunking strategies: topic-based, regex, sentence, and more -- 🧠 Advanced extraction strategies: cosine clustering, LLM, and more -- 🎯 CSS selector support -- 📝 Passes instructions/keywords to refine extraction - -# Crawl4AI - -## 🌟 Shoutout to Contributors of v0.2.77! - -A big thank you to the amazing contributors who've made this release possible: - -- [@aravindkarnam](https://github.com/aravindkarnam) for the new image description feature -- [@FractalMind](https://github.com/FractalMind) for our official Docker Hub image -- [@ketonkss4](https://github.com/ketonkss4) for helping streamline our Selenium setup - -Your contributions are driving Crawl4AI forward! 
🚀 - -## Cool Examples 🚀 - -### Quick Start - -```python -from crawl4ai import WebCrawler - -# Create an instance of WebCrawler -crawler = WebCrawler() - -# Warm up the crawler (load necessary models) -crawler.warmup() - -# Run the crawler on a URL -result = crawler.run(url="https://www.nbcnews.com/business") - -# Print the extracted content -print(result.markdown) -``` - -## How to install 🛠 - -### Using pip 🐍 -```bash -virtualenv venv -source venv/bin/activate -pip install "crawl4ai @ git+https://github.com/unclecode/crawl4ai.git" -``` - -### Using Docker 🐳 - -```bash -# For Mac users (M1/M2) -# docker build --platform linux/amd64 -t crawl4ai . -docker build -t crawl4ai . -docker run -d -p 8000:80 crawl4ai -``` - -### Using Docker Hub 🐳 - -```bash -docker pull unclecode/crawl4ai:latest -docker run -d -p 8000:80 unclecode/crawl4ai:latest -``` - - -## Speed-First Design 🚀 - -Perhaps the most important design principle for this library is speed. We need to ensure it can handle many links and resources in parallel as quickly as possible. By combining this speed with fast LLMs like Groq, the results will be truly amazing. - -```python -import time -from crawl4ai.web_crawler import WebCrawler -crawler = WebCrawler() -crawler.warmup() - -start = time.time() -url = r"https://www.nbcnews.com/business" -result = crawler.run( url, word_count_threshold=10, bypass_cache=True) -end = time.time() -print(f"Time taken: {end - start}") -``` - -Let's take a look the calculated time for the above code snippet: - -```bash -[LOG] 🚀 Crawling done, success: True, time taken: 1.3623387813568115 seconds -[LOG] 🚀 Content extracted, success: True, time taken: 0.05715131759643555 seconds -[LOG] 🚀 Extraction, time taken: 0.05750393867492676 seconds. -Time taken: 1.439958095550537 -``` -Fetching the content from the page took 1.3623 seconds, and extracting the content took 0.0575 seconds. 
🚀 - -### Extract Structured Data from Web Pages 📊 - -Crawl all OpenAI models and their fees from the official page. - -```python -import os -from crawl4ai import WebCrawler -from crawl4ai.extraction_strategy import LLMExtractionStrategy -from pydantic import BaseModel, Field - -class OpenAIModelFee(BaseModel): - model_name: str = Field(..., description="Name of the OpenAI model.") - input_fee: str = Field(..., description="Fee for input token for the OpenAI model.") - output_fee: str = Field(..., description="Fee for output token ßfor the OpenAI model.") - -url = 'https://openai.com/api/pricing/' -crawler = WebCrawler() -crawler.warmup() - -result = crawler.run( - url=url, - word_count_threshold=1, - extraction_strategy= LLMExtractionStrategy( - provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'), - schema=OpenAIModelFee.schema(), - extraction_type="schema", - instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens. - Do not miss any models in the entire content. 
One extracted model JSON format should look like this: - {"model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens"}.""" - ), - bypass_cache=True, - ) - -print(result.extracted_content) -``` - -### Execute JS, Filter Data with CSS Selector, and Clustering - -```python -from crawl4ai import WebCrawler -from crawl4ai.chunking_strategy import CosineStrategy - -js_code = ["const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"] - -crawler = WebCrawler() -crawler.warmup() - -result = crawler.run( - url="https://www.nbcnews.com/business", - js=js_code, - css_selector="p", - extraction_strategy=CosineStrategy(semantic_filter="technology") -) - -print(result.extracted_content) -``` - -### Extract Structured Data from Web Pages With Proxy and BaseUrl - -```python -from crawl4ai import WebCrawler -from crawl4ai.extraction_strategy import LLMExtractionStrategy - -def create_crawler(): - crawler = WebCrawler(verbose=True, proxy="http://127.0.0.1:7890") - crawler.warmup() - return crawler - -crawler = create_crawler() - -crawler.warmup() - -result = crawler.run( - url="https://www.nbcnews.com/business", - extraction_strategy=LLMExtractionStrategy( - provider="openai/gpt-4o", - api_token="sk-", - base_url="https://api.openai.com/v1" - ) -) - -print(result.markdown) -``` - -## Documentation 📚 - -For detailed documentation, including installation instructions, advanced features, and API reference, visit our [Documentation Website](https://crawl4ai.com/mkdocs/). - -## Contributing 🤝 - -We welcome contributions from the open-source community. Check out our [contribution guidelines](https://github.com/unclecode/crawl4ai/blob/main/CONTRIBUTING.md) for more information. - -## License 📄 - -Crawl4AI is released under the [Apache 2.0 License](https://github.com/unclecode/crawl4ai/blob/main/LICENSE). 
- -## Contact 📧 - -For questions, suggestions, or feedback, feel free to reach out: - -- GitHub: [unclecode](https://github.com/unclecode) -- Twitter: [@unclecode](https://twitter.com/unclecode) -- Website: [crawl4ai.com](https://crawl4ai.com) - -Happy Crawling! 🕸️🚀 - -## Star History - -[![Star History Chart](https://api.star-history.com/svg?repos=unclecode/crawl4ai&type=Date)](https://star-history.com/#unclecode/crawl4ai&Date) \ No newline at end of file diff --git a/a.md b/a.md new file mode 100644 index 00000000..4d68148f --- /dev/null +++ b/a.md @@ -0,0 +1,4214 @@ +diff --git a/.gitignore b/.gitignore +index 02c75b3..432b5aa 100644 +--- a/.gitignore ++++ b/.gitignore +@@ -206,6 +206,7 @@ pypi_build.sh + git_issues.py + git_issues.md + ++.next/ + .tests/ + .issues/ + .docs/ +diff --git a/README.sync.md b/README.sync.md +deleted file mode 100644 +index 6bbef7e..0000000 +--- a/README.sync.md ++++ /dev/null +@@ -1,244 +0,0 @@ +-# Crawl4AI v0.2.77 🕷️🤖 +- +-[![GitHub Stars](https://img.shields.io/github/stars/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/stargazers) +-[![GitHub Forks](https://img.shields.io/github/forks/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/network/members) +-[![GitHub Issues](https://img.shields.io/github/issues/unclecode/crawl4ai)](https://github.com/unclecode/crawl4ai/issues) +-[![GitHub Pull Requests](https://img.shields.io/github/issues-pr/unclecode/crawl4ai)](https://github.com/unclecode/crawl4ai/pulls) +-[![License](https://img.shields.io/github/license/unclecode/crawl4ai)](https://github.com/unclecode/crawl4ai/blob/main/LICENSE) +- +-Crawl4AI simplifies web crawling and data extraction, making it accessible for large language models (LLMs) and AI applications. 🆓🌐 +- +-#### [v0.2.77] - 2024-08-02 +- +-Major improvements in functionality, performance, and cross-platform compatibility! 
🚀 +- +-- 🐳 **Docker enhancements**: +- - Significantly improved Dockerfile for easy installation on Linux, Mac, and Windows. +-- 🌐 **Official Docker Hub image**: +- - Launched our first official image on Docker Hub for streamlined deployment (unclecode/crawl4ai). +-- 🔧 **Selenium upgrade**: +- - Removed dependency on ChromeDriver, now using Selenium's built-in capabilities for better compatibility. +-- 🖼️ **Image description**: +- - Implemented ability to generate textual descriptions for extracted images from web pages. +-- ⚡ **Performance boost**: +- - Various improvements to enhance overall speed and performance. +- +-## Try it Now! +- +-✨ Play around with this [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1sJPAmeLj5PMrg2VgOwMJ2ubGIcK0cJeX?usp=sharing) +- +-✨ visit our [Documentation Website](https://crawl4ai.com/mkdocs/) +- +-✨ Check [Demo](https://crawl4ai.com/mkdocs/demo) +- +-## Features ✨ +- +-- 🆓 Completely free and open-source +-- 🤖 LLM-friendly output formats (JSON, cleaned HTML, markdown) +-- 🌍 Supports crawling multiple URLs simultaneously +-- 🎨 Extracts and returns all media tags (Images, Audio, and Video) +-- 🔗 Extracts all external and internal links +-- 📚 Extracts metadata from the page +-- 🔄 Custom hooks for authentication, headers, and page modifications before crawling +-- 🕵️ User-agent customization +-- 🖼️ Takes screenshots of the page +-- 📜 Executes multiple custom JavaScripts before crawling +-- 📚 Various chunking strategies: topic-based, regex, sentence, and more +-- 🧠 Advanced extraction strategies: cosine clustering, LLM, and more +-- 🎯 CSS selector support +-- 📝 Passes instructions/keywords to refine extraction +- +-# Crawl4AI +- +-## 🌟 Shoutout to Contributors of v0.2.77! 
+- +-A big thank you to the amazing contributors who've made this release possible: +- +-- [@aravindkarnam](https://github.com/aravindkarnam) for the new image description feature +-- [@FractalMind](https://github.com/FractalMind) for our official Docker Hub image +-- [@ketonkss4](https://github.com/ketonkss4) for helping streamline our Selenium setup +- +-Your contributions are driving Crawl4AI forward! 🚀 +- +-## Cool Examples 🚀 +- +-### Quick Start +- +-```python +-from crawl4ai import WebCrawler +- +-# Create an instance of WebCrawler +-crawler = WebCrawler() +- +-# Warm up the crawler (load necessary models) +-crawler.warmup() +- +-# Run the crawler on a URL +-result = crawler.run(url="https://www.nbcnews.com/business") +- +-# Print the extracted content +-print(result.markdown) +-``` +- +-## How to install 🛠 +- +-### Using pip 🐍 +-```bash +-virtualenv venv +-source venv/bin/activate +-pip install "crawl4ai @ git+https://github.com/unclecode/crawl4ai.git" +-``` +- +-### Using Docker 🐳 +- +-```bash +-# For Mac users (M1/M2) +-# docker build --platform linux/amd64 -t crawl4ai . +-docker build -t crawl4ai . +-docker run -d -p 8000:80 crawl4ai +-``` +- +-### Using Docker Hub 🐳 +- +-```bash +-docker pull unclecode/crawl4ai:latest +-docker run -d -p 8000:80 unclecode/crawl4ai:latest +-``` +- +- +-## Speed-First Design 🚀 +- +-Perhaps the most important design principle for this library is speed. We need to ensure it can handle many links and resources in parallel as quickly as possible. By combining this speed with fast LLMs like Groq, the results will be truly amazing. 
+- +-```python +-import time +-from crawl4ai.web_crawler import WebCrawler +-crawler = WebCrawler() +-crawler.warmup() +- +-start = time.time() +-url = r"https://www.nbcnews.com/business" +-result = crawler.run( url, word_count_threshold=10, bypass_cache=True) +-end = time.time() +-print(f"Time taken: {end - start}") +-``` +- +-Let's take a look the calculated time for the above code snippet: +- +-```bash +-[LOG] 🚀 Crawling done, success: True, time taken: 1.3623387813568115 seconds +-[LOG] 🚀 Content extracted, success: True, time taken: 0.05715131759643555 seconds +-[LOG] 🚀 Extraction, time taken: 0.05750393867492676 seconds. +-Time taken: 1.439958095550537 +-``` +-Fetching the content from the page took 1.3623 seconds, and extracting the content took 0.0575 seconds. 🚀 +- +-### Extract Structured Data from Web Pages 📊 +- +-Crawl all OpenAI models and their fees from the official page. +- +-```python +-import os +-from crawl4ai import WebCrawler +-from crawl4ai.extraction_strategy import LLMExtractionStrategy +-from pydantic import BaseModel, Field +- +-class OpenAIModelFee(BaseModel): +- model_name: str = Field(..., description="Name of the OpenAI model.") +- input_fee: str = Field(..., description="Fee for input token for the OpenAI model.") +- output_fee: str = Field(..., description="Fee for output token ßfor the OpenAI model.") +- +-url = 'https://openai.com/api/pricing/' +-crawler = WebCrawler() +-crawler.warmup() +- +-result = crawler.run( +- url=url, +- word_count_threshold=1, +- extraction_strategy= LLMExtractionStrategy( +- provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'), +- schema=OpenAIModelFee.schema(), +- extraction_type="schema", +- instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens. +- Do not miss any models in the entire content. 
One extracted model JSON format should look like this: +- {"model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens"}.""" +- ), +- bypass_cache=True, +- ) +- +-print(result.extracted_content) +-``` +- +-### Execute JS, Filter Data with CSS Selector, and Clustering +- +-```python +-from crawl4ai import WebCrawler +-from crawl4ai.chunking_strategy import CosineStrategy +- +-js_code = ["const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"] +- +-crawler = WebCrawler() +-crawler.warmup() +- +-result = crawler.run( +- url="https://www.nbcnews.com/business", +- js=js_code, +- css_selector="p", +- extraction_strategy=CosineStrategy(semantic_filter="technology") +-) +- +-print(result.extracted_content) +-``` +- +-### Extract Structured Data from Web Pages With Proxy and BaseUrl +- +-```python +-from crawl4ai import WebCrawler +-from crawl4ai.extraction_strategy import LLMExtractionStrategy +- +-def create_crawler(): +- crawler = WebCrawler(verbose=True, proxy="http://127.0.0.1:7890") +- crawler.warmup() +- return crawler +- +-crawler = create_crawler() +- +-crawler.warmup() +- +-result = crawler.run( +- url="https://www.nbcnews.com/business", +- extraction_strategy=LLMExtractionStrategy( +- provider="openai/gpt-4o", +- api_token="sk-", +- base_url="https://api.openai.com/v1" +- ) +-) +- +-print(result.markdown) +-``` +- +-## Documentation 📚 +- +-For detailed documentation, including installation instructions, advanced features, and API reference, visit our [Documentation Website](https://crawl4ai.com/mkdocs/). +- +-## Contributing 🤝 +- +-We welcome contributions from the open-source community. Check out our [contribution guidelines](https://github.com/unclecode/crawl4ai/blob/main/CONTRIBUTING.md) for more information. 
+- +-## License 📄 +- +-Crawl4AI is released under the [Apache 2.0 License](https://github.com/unclecode/crawl4ai/blob/main/LICENSE). +- +-## Contact 📧 +- +-For questions, suggestions, or feedback, feel free to reach out: +- +-- GitHub: [unclecode](https://github.com/unclecode) +-- Twitter: [@unclecode](https://twitter.com/unclecode) +-- Website: [crawl4ai.com](https://crawl4ai.com) +- +-Happy Crawling! 🕸️🚀 +- +-## Star History +- +-[![Star History Chart](https://api.star-history.com/svg?repos=unclecode/crawl4ai&type=Date)](https://star-history.com/#unclecode/crawl4ai&Date) +\ No newline at end of file +diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py +index cee7c25..d297dfc 100644 +--- a/crawl4ai/__init__.py ++++ b/crawl4ai/__init__.py +@@ -1,7 +1,11 @@ + # __init__.py + + from .async_webcrawler import AsyncWebCrawler, CacheMode +- ++from .async_configs import BrowserConfig, CrawlerRunConfig ++from .extraction_strategy import ExtractionStrategy, LLMExtractionStrategy, CosineStrategy, JsonCssExtractionStrategy ++from .chunking_strategy import ChunkingStrategy, RegexChunking ++from .markdown_generation_strategy import DefaultMarkdownGenerator ++from .content_filter_strategy import PruningContentFilter, BM25ContentFilter + from .models import CrawlResult + from .__version__ import __version__ + +@@ -9,6 +13,17 @@ __all__ = [ + "AsyncWebCrawler", + "CrawlResult", + "CacheMode", ++ 'BrowserConfig', ++ 'CrawlerRunConfig', ++ 'ExtractionStrategy', ++ 'LLMExtractionStrategy', ++ 'CosineStrategy', ++ 'JsonCssExtractionStrategy', ++ 'ChunkingStrategy', ++ 'RegexChunking', ++ 'DefaultMarkdownGenerator', ++ 'PruningContentFilter', ++ 'BM25ContentFilter', + ] + + def is_sync_version_installed(): +diff --git a/crawl4ai/async_crawler_strategy.current.py b/crawl4ai/async_crawler_strategy.current.py +deleted file mode 100644 +index 6302447..0000000 +--- a/crawl4ai/async_crawler_strategy.current.py ++++ /dev/null +@@ -1,1475 +0,0 @@ +-import asyncio +-import base64 +-import 
time +-from abc import ABC, abstractmethod +-from typing import Callable, Dict, Any, List, Optional, Awaitable +-import os, sys, shutil +-import tempfile, subprocess +-from playwright.async_api import async_playwright, Page, Browser, Error +-from playwright.async_api import TimeoutError as PlaywrightTimeoutError +-from io import BytesIO +-from PIL import Image, ImageDraw, ImageFont +-from pathlib import Path +-from playwright.async_api import ProxySettings +-from pydantic import BaseModel +-import hashlib +-import json +-import uuid +-from .models import AsyncCrawlResponse +-from .utils import create_box_message +-from .user_agent_generator import UserAgentGenerator +-from playwright_stealth import StealthConfig, stealth_async +- +-stealth_config = StealthConfig( +- webdriver=True, +- chrome_app=True, +- chrome_csi=True, +- chrome_load_times=True, +- chrome_runtime=True, +- navigator_languages=True, +- navigator_plugins=True, +- navigator_permissions=True, +- webgl_vendor=True, +- outerdimensions=True, +- navigator_hardware_concurrency=True, +- media_codecs=True, +-) +- +-BROWSER_DISABLE_OPTIONS = [ +- "--disable-background-networking", +- "--disable-background-timer-throttling", +- "--disable-backgrounding-occluded-windows", +- "--disable-breakpad", +- "--disable-client-side-phishing-detection", +- "--disable-component-extensions-with-background-pages", +- "--disable-default-apps", +- "--disable-extensions", +- "--disable-features=TranslateUI", +- "--disable-hang-monitor", +- "--disable-ipc-flooding-protection", +- "--disable-popup-blocking", +- "--disable-prompt-on-repost", +- "--disable-sync", +- "--force-color-profile=srgb", +- "--metrics-recording-only", +- "--no-first-run", +- "--password-store=basic", +- "--use-mock-keychain" +-] +- +- +-class ManagedBrowser: +- def __init__(self, browser_type: str = "chromium", user_data_dir: Optional[str] = None, headless: bool = False, logger = None, host: str = "localhost", debugging_port: int = 9222): +- 
self.browser_type = browser_type +- self.user_data_dir = user_data_dir +- self.headless = headless +- self.browser_process = None +- self.temp_dir = None +- self.debugging_port = debugging_port +- self.host = host +- self.logger = logger +- self.shutting_down = False +- +- async def start(self) -> str: +- """ +- Starts the browser process and returns the CDP endpoint URL. +- If user_data_dir is not provided, creates a temporary directory. +- """ +- +- # Create temp dir if needed +- if not self.user_data_dir: +- self.temp_dir = tempfile.mkdtemp(prefix="browser-profile-") +- self.user_data_dir = self.temp_dir +- +- # Get browser path and args based on OS and browser type +- browser_path = self._get_browser_path() +- args = self._get_browser_args() +- +- # Start browser process +- try: +- self.browser_process = subprocess.Popen( +- args, +- stdout=subprocess.PIPE, +- stderr=subprocess.PIPE +- ) +- # Monitor browser process output for errors +- asyncio.create_task(self._monitor_browser_process()) +- await asyncio.sleep(2) # Give browser time to start +- return f"http://{self.host}:{self.debugging_port}" +- except Exception as e: +- await self.cleanup() +- raise Exception(f"Failed to start browser: {e}") +- +- async def _monitor_browser_process(self): +- """Monitor the browser process for unexpected termination.""" +- if self.browser_process: +- try: +- stdout, stderr = await asyncio.gather( +- asyncio.to_thread(self.browser_process.stdout.read), +- asyncio.to_thread(self.browser_process.stderr.read) +- ) +- +- # Check shutting_down flag BEFORE logging anything +- if self.browser_process.poll() is not None: +- if not self.shutting_down: +- self.logger.error( +- message="Browser process terminated unexpectedly | Code: {code} | STDOUT: {stdout} | STDERR: {stderr}", +- tag="ERROR", +- params={ +- "code": self.browser_process.returncode, +- "stdout": stdout.decode(), +- "stderr": stderr.decode() +- } +- ) +- await self.cleanup() +- else: +- self.logger.info( +- 
message="Browser process terminated normally | Code: {code}", +- tag="INFO", +- params={"code": self.browser_process.returncode} +- ) +- except Exception as e: +- if not self.shutting_down: +- self.logger.error( +- message="Error monitoring browser process: {error}", +- tag="ERROR", +- params={"error": str(e)} +- ) +- +- def _get_browser_path(self) -> str: +- """Returns the browser executable path based on OS and browser type""" +- if sys.platform == "darwin": # macOS +- paths = { +- "chromium": "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome", +- "firefox": "/Applications/Firefox.app/Contents/MacOS/firefox", +- "webkit": "/Applications/Safari.app/Contents/MacOS/Safari" +- } +- elif sys.platform == "win32": # Windows +- paths = { +- "chromium": "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe", +- "firefox": "C:\\Program Files\\Mozilla Firefox\\firefox.exe", +- "webkit": None # WebKit not supported on Windows +- } +- else: # Linux +- paths = { +- "chromium": "google-chrome", +- "firefox": "firefox", +- "webkit": None # WebKit not supported on Linux +- } +- +- return paths.get(self.browser_type) +- +- def _get_browser_args(self) -> List[str]: +- """Returns browser-specific command line arguments""" +- base_args = [self._get_browser_path()] +- +- if self.browser_type == "chromium": +- args = [ +- f"--remote-debugging-port={self.debugging_port}", +- f"--user-data-dir={self.user_data_dir}", +- ] +- if self.headless: +- args.append("--headless=new") +- elif self.browser_type == "firefox": +- args = [ +- "--remote-debugging-port", str(self.debugging_port), +- "--profile", self.user_data_dir, +- ] +- if self.headless: +- args.append("--headless") +- else: +- raise NotImplementedError(f"Browser type {self.browser_type} not supported") +- +- return base_args + args +- +- async def cleanup(self): +- """Cleanup browser process and temporary directory""" +- # Set shutting_down flag BEFORE any termination actions +- self.shutting_down = True +- +- if 
self.browser_process: +- try: +- self.browser_process.terminate() +- # Wait for process to end gracefully +- for _ in range(10): # 10 attempts, 100ms each +- if self.browser_process.poll() is not None: +- break +- await asyncio.sleep(0.1) +- +- # Force kill if still running +- if self.browser_process.poll() is None: +- self.browser_process.kill() +- await asyncio.sleep(0.1) # Brief wait for kill to take effect +- +- except Exception as e: +- self.logger.error( +- message="Error terminating browser: {error}", +- tag="ERROR", +- params={"error": str(e)} +- ) +- +- if self.temp_dir and os.path.exists(self.temp_dir): +- try: +- shutil.rmtree(self.temp_dir) +- except Exception as e: +- self.logger.error( +- message="Error removing temporary directory: {error}", +- tag="ERROR", +- params={"error": str(e)} +- ) +- +- +-class AsyncCrawlerStrategy(ABC): +- @abstractmethod +- async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse: +- pass +- +- @abstractmethod +- async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]: +- pass +- +- @abstractmethod +- async def take_screenshot(self, **kwargs) -> str: +- pass +- +- @abstractmethod +- def update_user_agent(self, user_agent: str): +- pass +- +- @abstractmethod +- def set_hook(self, hook_type: str, hook: Callable): +- pass +- +-class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): +- def __init__(self, use_cached_html=False, js_code=None, logger = None, **kwargs): +- self.text_only = kwargs.get("text_only", False) +- self.light_mode = kwargs.get("light_mode", False) +- self.logger = logger +- self.use_cached_html = use_cached_html +- self.viewport_width = kwargs.get("viewport_width", 800 if self.text_only else 1920) +- self.viewport_height = kwargs.get("viewport_height", 600 if self.text_only else 1080) +- +- if self.text_only: +- self.extra_args = kwargs.get("extra_args", []) + [ +- '--disable-images', +- '--disable-javascript', +- '--disable-gpu', +- '--disable-software-rasterizer', +- 
'--disable-dev-shm-usage' +- ] +- +- self.user_agent = kwargs.get( +- "user_agent", +- "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.5845.187 Safari/604.1 Edg/117.0.2045.47" +- # "Mozilla/5.0 (Linux; Android 11; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.120 Mobile Safari/537.36" +- ) +- user_agenr_generator = UserAgentGenerator() +- if kwargs.get("user_agent_mode") == "random": +- self.user_agent = user_agenr_generator.generate( +- **kwargs.get("user_agent_generator_config", {}) +- ) +- self.proxy = kwargs.get("proxy") +- self.proxy_config = kwargs.get("proxy_config") +- self.headless = kwargs.get("headless", True) +- self.browser_type = kwargs.get("browser_type", "chromium") +- self.headers = kwargs.get("headers", {}) +- self.browser_hint = user_agenr_generator.generate_client_hints(self.user_agent) +- self.headers.setdefault("sec-ch-ua", self.browser_hint) +- self.cookies = kwargs.get("cookies", []) +- self.storage_state = kwargs.get("storage_state", None) +- self.sessions = {} +- self.session_ttl = 1800 +- self.js_code = js_code +- self.verbose = kwargs.get("verbose", False) +- self.playwright = None +- self.browser = None +- self.sleep_on_close = kwargs.get("sleep_on_close", False) +- self.use_managed_browser = kwargs.get("use_managed_browser", False) +- self.user_data_dir = kwargs.get("user_data_dir", None) +- self.use_persistent_context = kwargs.get("use_persistent_context", False) +- self.chrome_channel = kwargs.get("chrome_channel", "chrome") +- self.managed_browser = None +- self.default_context = None +- self.hooks = { +- 'on_browser_created': None, +- 'on_user_agent_updated': None, +- 'on_execution_started': None, +- 'before_goto': None, +- 'after_goto': None, +- 'before_return_html': None, +- 'before_retrieve_html': None +- } +- self.extra_args = kwargs.get("extra_args", []) +- self.ignore_https_errors = kwargs.get("ignore_https_errors", True) +- 
self.java_script_enabled = kwargs.get("java_script_enabled", True) +- self.accept_downloads = kwargs.get("accept_downloads", False) +- self.downloads_path = kwargs.get("downloads_path") +- self._downloaded_files = [] # Track downloaded files for current crawl +- if self.accept_downloads and not self.downloads_path: +- self.downloads_path = os.path.join(os.getcwd(), "downloads") +- os.makedirs(self.downloads_path, exist_ok=True) +- +- +- async def __aenter__(self): +- await self.start() +- return self +- +- async def __aexit__(self, exc_type, exc_val, exc_tb): +- await self.close() +- +- async def start(self): +- if self.playwright is None: +- self.playwright = await async_playwright().start() +- if self.browser is None: +- if self.use_managed_browser: +- # Use managed browser approach +- self.managed_browser = ManagedBrowser( +- browser_type=self.browser_type, +- user_data_dir=self.user_data_dir, +- headless=self.headless, +- logger=self.logger +- ) +- cdp_url = await self.managed_browser.start() +- self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url) +- +- # Get the default context that maintains the user profile +- contexts = self.browser.contexts +- if contexts: +- self.default_context = contexts[0] +- else: +- # If no default context exists, create one +- self.default_context = await self.browser.new_context( +- viewport={"width": self.viewport_width, "height": self.viewport_height}, +- storage_state=self.storage_state, +- user_agent= self.user_agent, +- accept_downloads=self.accept_downloads, +- ignore_https_errors=self.ignore_https_errors, +- java_script_enabled=self.java_script_enabled, +- ) +- +- # Set up the default context +- if self.default_context: +- await self.default_context.set_extra_http_headers(self.headers) +- if self.cookies: +- await self.default_context.add_cookies(self.cookies) +- if self.storage_state: +- # If storage_state is a dictionary or file path, Playwright will handle it. 
+- await self.default_context.storage_state(path=None) # Just ensuring default_context is ready +- if self.accept_downloads: +- await self.default_context.set_default_timeout(60000) +- await self.default_context.set_default_navigation_timeout(60000) +- self.default_context._impl_obj._options["accept_downloads"] = True +- self.default_context._impl_obj._options["downloads_path"] = self.downloads_path +- +- if self.user_agent: +- await self.default_context.set_extra_http_headers({ +- "User-Agent": self.user_agent, +- "sec-ch-ua": self.browser_hint, +- # **self.headers +- }) +- else: +- # Base browser arguments +- browser_args = { +- "headless": self.headless, +- "args": [ +- "--no-sandbox", +- "--disable-dev-shm-usage", +- "--no-first-run", +- "--no-default-browser-check", +- "--disable-infobars", +- "--window-position=0,0", +- "--ignore-certificate-errors", +- "--ignore-certificate-errors-spki-list", +- "--disable-blink-features=AutomationControlled", +- "--window-position=400,0", +- f"--window-size={self.viewport_width},{self.viewport_height}", +- ] +- } +- +- if self.light_mode: +- browser_args["args"].extend(BROWSER_DISABLE_OPTIONS) +- +- if self.text_only: +- browser_args["args"].extend([ +- '--blink-settings=imagesEnabled=false', +- '--disable-remote-fonts' +- ]) +- +- # Add channel if specified (try Chrome first) +- if self.chrome_channel: +- browser_args["channel"] = self.chrome_channel +- +- # Add extra args if provided +- if self.extra_args: +- browser_args["args"].extend(self.extra_args) +- +- # Add downloads path if downloads are enabled +- if self.accept_downloads: +- browser_args["downloads_path"] = self.downloads_path +- +- # Add proxy settings if a proxy is specified +- if self.proxy: +- proxy_settings = ProxySettings(server=self.proxy) +- browser_args["proxy"] = proxy_settings +- elif self.proxy_config: +- proxy_settings = ProxySettings( +- server=self.proxy_config.get("server"), +- username=self.proxy_config.get("username"), +- 
password=self.proxy_config.get("password") +- ) +- browser_args["proxy"] = proxy_settings +- +- try: +- # Select the appropriate browser based on the browser_type +- if self.browser_type == "firefox": +- self.browser = await self.playwright.firefox.launch(**browser_args) +- elif self.browser_type == "webkit": +- if "viewport" not in browser_args: +- browser_args["viewport"] = {"width": self.viewport_width, "height": self.viewport_height} +- self.browser = await self.playwright.webkit.launch(**browser_args) +- else: +- if self.use_persistent_context and self.user_data_dir: +- self.browser = await self.playwright.chromium.launch_persistent_context( +- user_data_dir=self.user_data_dir, +- accept_downloads=self.accept_downloads, +- downloads_path=self.downloads_path if self.accept_downloads else None, +- **browser_args +- ) +- self.default_context = self.browser +- else: +- self.browser = await self.playwright.chromium.launch(**browser_args) +- self.default_context = self.browser +- +- except Exception as e: +- # Fallback to chromium if Chrome channel fails +- if "chrome" in str(e) and browser_args.get("channel") == "chrome": +- browser_args["channel"] = "chromium" +- if self.use_persistent_context and self.user_data_dir: +- self.browser = await self.playwright.chromium.launch_persistent_context( +- user_data_dir=self.user_data_dir, +- **browser_args +- ) +- self.default_context = self.browser +- else: +- self.browser = await self.playwright.chromium.launch(**browser_args) +- else: +- raise +- +- await self.execute_hook('on_browser_created', self.browser) +- +- async def close(self): +- if self.sleep_on_close: +- await asyncio.sleep(0.5) +- +- # Close all active sessions +- session_ids = list(self.sessions.keys()) +- for session_id in session_ids: +- await self.kill_session(session_id) +- +- if self.browser: +- await self.browser.close() +- self.browser = None +- +- if self.managed_browser: +- await asyncio.sleep(0.5) +- await self.managed_browser.cleanup() +- 
self.managed_browser = None +- +- if self.playwright: +- await self.playwright.stop() +- self.playwright = None +- +- # Issue #256: Remove __del__ method to avoid potential issues with async cleanup +- # def __del__(self): +- # if self.browser or self.playwright: +- # asyncio.get_event_loop().run_until_complete(self.close()) +- +- def set_hook(self, hook_type: str, hook: Callable): +- if hook_type in self.hooks: +- self.hooks[hook_type] = hook +- else: +- raise ValueError(f"Invalid hook type: {hook_type}") +- +- async def execute_hook(self, hook_type: str, *args, **kwargs): +- hook = self.hooks.get(hook_type) +- if hook: +- if asyncio.iscoroutinefunction(hook): +- return await hook(*args, **kwargs) +- else: +- return hook(*args, **kwargs) +- return args[0] if args else None +- +- def update_user_agent(self, user_agent: str): +- self.user_agent = user_agent +- +- def set_custom_headers(self, headers: Dict[str, str]): +- self.headers = headers +- +- async def kill_session(self, session_id: str): +- if session_id in self.sessions: +- context, page, _ = self.sessions[session_id] +- await page.close() +- if not self.use_managed_browser: +- await context.close() +- del self.sessions[session_id] +- +- def _cleanup_expired_sessions(self): +- current_time = time.time() +- expired_sessions = [ +- sid for sid, (_, _, last_used) in self.sessions.items() +- if current_time - last_used > self.session_ttl +- ] +- for sid in expired_sessions: +- asyncio.create_task(self.kill_session(sid)) +- +- async def smart_wait(self, page: Page, wait_for: str, timeout: float = 30000): +- wait_for = wait_for.strip() +- +- if wait_for.startswith('js:'): +- # Explicitly specified JavaScript +- js_code = wait_for[3:].strip() +- return await self.csp_compliant_wait(page, js_code, timeout) +- elif wait_for.startswith('css:'): +- # Explicitly specified CSS selector +- css_selector = wait_for[4:].strip() +- try: +- await page.wait_for_selector(css_selector, timeout=timeout) +- except Error as e: +- if 
'Timeout' in str(e): +- raise TimeoutError(f"Timeout after {timeout}ms waiting for selector '{css_selector}'") +- else: +- raise ValueError(f"Invalid CSS selector: '{css_selector}'") +- else: +- # Auto-detect based on content +- if wait_for.startswith('()') or wait_for.startswith('function'): +- # It's likely a JavaScript function +- return await self.csp_compliant_wait(page, wait_for, timeout) +- else: +- # Assume it's a CSS selector first +- try: +- await page.wait_for_selector(wait_for, timeout=timeout) +- except Error as e: +- if 'Timeout' in str(e): +- raise TimeoutError(f"Timeout after {timeout}ms waiting for selector '{wait_for}'") +- else: +- # If it's not a timeout error, it might be an invalid selector +- # Let's try to evaluate it as a JavaScript function as a fallback +- try: +- return await self.csp_compliant_wait(page, f"() => {{{wait_for}}}", timeout) +- except Error: +- raise ValueError(f"Invalid wait_for parameter: '{wait_for}'. " +- "It should be either a valid CSS selector, a JavaScript function, " +- "or explicitly prefixed with 'js:' or 'css:'.") +- +- async def csp_compliant_wait(self, page: Page, user_wait_function: str, timeout: float = 30000): +- wrapper_js = f""" +- async () => {{ +- const userFunction = {user_wait_function}; +- const startTime = Date.now(); +- while (true) {{ +- if (await userFunction()) {{ +- return true; +- }} +- if (Date.now() - startTime > {timeout}) {{ +- throw new Error('Timeout waiting for condition'); +- }} +- await new Promise(resolve => setTimeout(resolve, 100)); +- }} +- }} +- """ +- +- try: +- await page.evaluate(wrapper_js) +- except TimeoutError: +- raise TimeoutError(f"Timeout after {timeout}ms waiting for condition") +- except Exception as e: +- raise RuntimeError(f"Error in wait condition: {str(e)}") +- +- async def process_iframes(self, page): +- # Find all iframes +- iframes = await page.query_selector_all('iframe') +- +- for i, iframe in enumerate(iframes): +- try: +- # Add a unique identifier to the 
iframe +- await iframe.evaluate(f'(element) => element.id = "iframe-{i}"') +- +- # Get the frame associated with this iframe +- frame = await iframe.content_frame() +- +- if frame: +- # Wait for the frame to load +- await frame.wait_for_load_state('load', timeout=30000) # 30 seconds timeout +- +- # Extract the content of the iframe's body +- iframe_content = await frame.evaluate('() => document.body.innerHTML') +- +- # Generate a unique class name for this iframe +- class_name = f'extracted-iframe-content-{i}' +- +- # Replace the iframe with a div containing the extracted content +- _iframe = iframe_content.replace('`', '\\`') +- await page.evaluate(f""" +- () => {{ +- const iframe = document.getElementById('iframe-{i}'); +- const div = document.createElement('div'); +- div.innerHTML = `{_iframe}`; +- div.className = '{class_name}'; +- iframe.replaceWith(div); +- }} +- """) +- else: +- # print(f"Warning: Could not access content frame for iframe {i}") +- self.logger.warning( +- message="Could not access content frame for iframe {index}", +- tag="SCRAPE", +- params={"index": i} +- ) +- except Exception as e: +- self.logger.error( +- message="Error processing iframe {index}: {error}", +- tag="ERROR", +- params={"index": i, "error": str(e)} +- ) +- # print(f"Error processing iframe {i}: {str(e)}") +- +- # Return the page object +- return page +- +- async def create_session(self, **kwargs) -> str: +- """Creates a new browser session and returns its ID.""" +- if not self.browser: +- await self.start() +- +- session_id = kwargs.get('session_id') or str(uuid.uuid4()) +- +- if self.use_managed_browser: +- page = await self.default_context.new_page() +- self.sessions[session_id] = (self.default_context, page, time.time()) +- else: +- if self.use_persistent_context and self.browser_type in ["chrome", "chromium"]: +- context = self.browser +- page = await context.new_page() +- else: +- context = await self.browser.new_context( +- user_agent=kwargs.get("user_agent", 
self.user_agent), +- viewport={"width": self.viewport_width, "height": self.viewport_height}, +- proxy={"server": self.proxy} if self.proxy else None, +- accept_downloads=self.accept_downloads, +- storage_state=self.storage_state, +- ignore_https_errors=True +- ) +- +- if self.cookies: +- await context.add_cookies(self.cookies) +- await context.set_extra_http_headers(self.headers) +- page = await context.new_page() +- +- self.sessions[session_id] = (context, page, time.time()) +- +- return session_id +- +- async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse: +- """ +- Crawls a given URL or processes raw HTML/local file content based on the URL prefix. +- +- Args: +- url (str): The URL to crawl. Supported prefixes: +- - 'http://' or 'https://': Web URL to crawl. +- - 'file://': Local file path to process. +- - 'raw:': Raw HTML content to process. +- **kwargs: Additional parameters: +- - 'screenshot' (bool): Whether to take a screenshot. +- - ... [other existing parameters] +- +- Returns: +- AsyncCrawlResponse: The response containing HTML, headers, status code, and optional screenshot. 
+- """ +- response_headers = {} +- status_code = 200 # Default to 200 for local/raw HTML +- screenshot_requested = kwargs.get('screenshot', False) +- screenshot_data = None +- +- if url.startswith(('http://', 'https://')): +- # Proceed with standard web crawling +- return await self._crawl_web(url, **kwargs) +- +- elif url.startswith('file://'): +- # Process local file +- local_file_path = url[7:] # Remove 'file://' prefix +- if not os.path.exists(local_file_path): +- raise FileNotFoundError(f"Local file not found: {local_file_path}") +- with open(local_file_path, 'r', encoding='utf-8') as f: +- html = f.read() +- if screenshot_requested: +- screenshot_data = await self._generate_screenshot_from_html(html) +- return AsyncCrawlResponse( +- html=html, +- response_headers=response_headers, +- status_code=status_code, +- screenshot=screenshot_data, +- get_delayed_content=None +- ) +- +- elif url.startswith('raw:'): +- # Process raw HTML content +- raw_html = url[4:] # Remove 'raw:' prefix +- html = raw_html +- if screenshot_requested: +- screenshot_data = await self._generate_screenshot_from_html(html) +- return AsyncCrawlResponse( +- html=html, +- response_headers=response_headers, +- status_code=status_code, +- screenshot=screenshot_data, +- get_delayed_content=None +- ) +- else: +- raise ValueError("URL must start with 'http://', 'https://', 'file://', or 'raw:'") +- +- +- async def _crawl_web(self, url: str, **kwargs) -> AsyncCrawlResponse: +- """ +- Existing web crawling logic remains unchanged. +- +- Args: +- url (str): The web URL to crawl. +- **kwargs: Additional parameters. +- +- Returns: +- AsyncCrawlResponse: The response containing HTML, headers, status code, and optional screenshot. 
+- """ +- response_headers = {} +- status_code = None +- +- # Reset downloaded files list for new crawl +- self._downloaded_files = [] +- +- self._cleanup_expired_sessions() +- session_id = kwargs.get("session_id") +- +- # Check if in kwargs we have user_agent that will override the default user_agent +- user_agent = kwargs.get("user_agent", self.user_agent) +- +- # Generate random user agent if magic mode is enabled and user_agent_mode is not random +- if kwargs.get("user_agent_mode") != "random" and kwargs.get("magic", False): +- user_agent = UserAgentGenerator().generate( +- **kwargs.get("user_agent_generator_config", {}) +- ) +- +- # Handle page creation differently for managed browser +- context = None +- if self.use_managed_browser: +- if session_id: +- # Reuse existing session if available +- context, page, _ = self.sessions.get(session_id, (None, None, None)) +- if not page: +- # Create new page in default context if session doesn't exist +- page = await self.default_context.new_page() +- self.sessions[session_id] = (self.default_context, page, time.time()) +- else: +- # Create new page in default context for non-session requests +- page = await self.default_context.new_page() +- else: +- if session_id: +- context, page, _ = self.sessions.get(session_id, (None, None, None)) +- if not context: +- if self.use_persistent_context and self.browser_type in ["chrome", "chromium"]: +- # In persistent context, browser is the context +- context = self.browser +- else: +- # Normal context creation for non-persistent or non-Chrome browsers +- context = await self.browser.new_context( +- user_agent=user_agent, +- viewport={"width": self.viewport_width, "height": self.viewport_height}, +- proxy={"server": self.proxy} if self.proxy else None, +- java_script_enabled=True, +- accept_downloads=self.accept_downloads, +- storage_state=self.storage_state, +- # downloads_path=self.downloads_path if self.accept_downloads else None +- ) +- await context.add_cookies([{"name": 
"cookiesEnabled", "value": "true", "url": url}]) +- if self.cookies: +- await context.add_cookies(self.cookies) +- await context.set_extra_http_headers(self.headers) +- +- page = await context.new_page() +- self.sessions[session_id] = (context, page, time.time()) +- else: +- if self.use_persistent_context and self.browser_type in ["chrome", "chromium"]: +- # In persistent context, browser is the context +- context = self.browser +- else: +- # Normal context creation +- context = await self.browser.new_context( +- user_agent=user_agent, +- # viewport={"width": 1920, "height": 1080}, +- viewport={"width": self.viewport_width, "height": self.viewport_height}, +- proxy={"server": self.proxy} if self.proxy else None, +- accept_downloads=self.accept_downloads, +- storage_state=self.storage_state, +- ignore_https_errors=True # Add this line +- ) +- if self.cookies: +- await context.add_cookies(self.cookies) +- await context.set_extra_http_headers(self.headers) +- +- if kwargs.get("override_navigator", False) or kwargs.get("simulate_user", False) or kwargs.get("magic", False): +- # Inject scripts to override navigator properties +- await context.add_init_script(""" +- // Pass the Permissions Test. +- const originalQuery = window.navigator.permissions.query; +- window.navigator.permissions.query = (parameters) => ( +- parameters.name === 'notifications' ? 
+- Promise.resolve({ state: Notification.permission }) : +- originalQuery(parameters) +- ); +- Object.defineProperty(navigator, 'webdriver', { +- get: () => undefined +- }); +- window.navigator.chrome = { +- runtime: {}, +- // Add other properties if necessary +- }; +- Object.defineProperty(navigator, 'plugins', { +- get: () => [1, 2, 3, 4, 5], +- }); +- Object.defineProperty(navigator, 'languages', { +- get: () => ['en-US', 'en'], +- }); +- Object.defineProperty(document, 'hidden', { +- get: () => false +- }); +- Object.defineProperty(document, 'visibilityState', { +- get: () => 'visible' +- }); +- """) +- +- page = await context.new_page() +- if kwargs.get("magic", False): +- await stealth_async(page, stealth_config) +- +- # Add console message and error logging +- if kwargs.get("log_console", False): +- page.on("console", lambda msg: print(f"Console: {msg.text}")) +- page.on("pageerror", lambda exc: print(f"Page Error: {exc}")) +- +- try: +- # Set up download handling if enabled +- if self.accept_downloads: +- page.on("download", lambda download: asyncio.create_task(self._handle_download(download))) +- +- if self.use_cached_html: +- cache_file_path = os.path.join( +- os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest() +- ) +- if os.path.exists(cache_file_path): +- html = "" +- with open(cache_file_path, "r") as f: +- html = f.read() +- # retrieve response headers and status code from cache +- with open(cache_file_path + ".meta", "r") as f: +- meta = json.load(f) +- response_headers = meta.get("response_headers", {}) +- status_code = meta.get("status_code") +- response = AsyncCrawlResponse( +- html=html, response_headers=response_headers, status_code=status_code +- ) +- return response +- +- if not kwargs.get("js_only", False): +- await self.execute_hook('before_goto', page, context = context, **kwargs) +- +- try: +- response = await page.goto( +- url, +- # wait_until=kwargs.get("wait_until", 
["domcontentloaded", "networkidle"]), +- wait_until=kwargs.get("wait_until", "domcontentloaded"), +- timeout=kwargs.get("page_timeout", 60000), +- ) +- except Error as e: +- raise RuntimeError(f"Failed on navigating ACS-GOTO :\n{str(e)}") +- +- await self.execute_hook('after_goto', page, context = context, **kwargs) +- +- # Get status code and headers +- status_code = response.status +- response_headers = response.headers +- else: +- status_code = 200 +- response_headers = {} +- +- # Replace the current wait_for_selector line with this more robust check: +- try: +- # First wait for body to exist, regardless of visibility +- await page.wait_for_selector('body', state='attached', timeout=30000) +- +- # Then wait for it to become visible by checking CSS +- await page.wait_for_function(""" +- () => { +- const body = document.body; +- const style = window.getComputedStyle(body); +- return style.display !== 'none' && +- style.visibility !== 'hidden' && +- style.opacity !== '0'; +- } +- """, timeout=30000) +- +- except Error as e: +- # If waiting fails, let's try to diagnose the issue +- visibility_info = await page.evaluate(""" +- () => { +- const body = document.body; +- const style = window.getComputedStyle(body); +- return { +- display: style.display, +- visibility: style.visibility, +- opacity: style.opacity, +- hasContent: body.innerHTML.length, +- classList: Array.from(body.classList) +- } +- } +- """) +- +- if self.verbose: +- print(f"Body visibility debug info: {visibility_info}") +- +- # Even if body is hidden, we might still want to proceed +- if kwargs.get('ignore_body_visibility', True): +- if self.verbose: +- print("Proceeding despite hidden body...") +- pass +- else: +- raise Error(f"Body element is hidden: {visibility_info}") +- +- # CONTENT LOADING ASSURANCE +- if not self.text_only and (kwargs.get("wait_for_images", True) or kwargs.get("adjust_viewport_to_content", False)): +- # Wait for network idle after initial load and images to load +- # await 
page.wait_for_load_state("networkidle") +- await page.wait_for_load_state("domcontentloaded") +- await asyncio.sleep(0.1) +- from playwright.async_api import TimeoutError as PlaywrightTimeoutError +- try: +- await page.wait_for_function("Array.from(document.images).every(img => img.complete)", timeout=1000) +- # Check for TimeoutError and ignore it +- except PlaywrightTimeoutError: +- pass +- +- # After initial load, adjust viewport to content size +- if not self.text_only and kwargs.get("adjust_viewport_to_content", False): +- try: +- # Get actual page dimensions +- page_width = await page.evaluate("document.documentElement.scrollWidth") +- page_height = await page.evaluate("document.documentElement.scrollHeight") +- +- target_width = self.viewport_width +- target_height = int(target_width * page_width / page_height * 0.95) +- await page.set_viewport_size({"width": target_width, "height": target_height}) +- +- # Compute scale factor +- # We want the entire page visible: the scale should make both width and height fit +- scale = min(target_width / page_width, target_height / page_height) +- +- # Now we call CDP to set metrics. +- # We tell Chrome that the "device" is page_width x page_height in size, +- # but we scale it down so everything fits within the real viewport. 
+- cdp = await page.context.new_cdp_session(page) +- await cdp.send('Emulation.setDeviceMetricsOverride', { +- 'width': page_width, # full page width +- 'height': page_height, # full page height +- 'deviceScaleFactor': 1, # keep normal DPR +- 'mobile': False, +- 'scale': scale # scale the entire rendered content +- }) +- +- except Exception as e: +- self.logger.warning( +- message="Failed to adjust viewport to content: {error}", +- tag="VIEWPORT", +- params={"error": str(e)} +- ) +- +- # After viewport adjustment, handle page scanning if requested +- if kwargs.get("scan_full_page", False): +- try: +- viewport_height = page.viewport_size.get("height", self.viewport_height) +- current_position = viewport_height # Start with one viewport height +- scroll_delay = kwargs.get("scroll_delay", 0.2) +- +- # Initial scroll +- await page.evaluate(f"window.scrollTo(0, {current_position})") +- await asyncio.sleep(scroll_delay) +- +- # Get height after first scroll to account for any dynamic content +- total_height = await page.evaluate("document.documentElement.scrollHeight") +- +- while current_position < total_height: +- current_position = min(current_position + viewport_height, total_height) +- await page.evaluate(f"window.scrollTo(0, {current_position})") +- await asyncio.sleep(scroll_delay) +- +- # Check for dynamic content +- new_height = await page.evaluate("document.documentElement.scrollHeight") +- if new_height > total_height: +- total_height = new_height +- +- # Scroll back to top +- await page.evaluate("window.scrollTo(0, 0)") +- +- except Exception as e: +- self.logger.warning( +- message="Failed to perform full page scan: {error}", +- tag="PAGE_SCAN", +- params={"error": str(e)} +- ) +- else: +- # Scroll to the bottom of the page +- await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") +- +- js_code = kwargs.get("js_code", kwargs.get("js", self.js_code)) +- if js_code: +- if isinstance(js_code, str): +- await page.evaluate(js_code) +- elif 
isinstance(js_code, list): +- for js in js_code: +- await page.evaluate(js) +- +- # await page.wait_for_timeout(100) +- +- # Check for on execution event +- await self.execute_hook('on_execution_started', page, context = context, **kwargs) +- +- if kwargs.get("simulate_user", False) or kwargs.get("magic", False): +- # Simulate user interactions +- await page.mouse.move(100, 100) +- await page.mouse.down() +- await page.mouse.up() +- await page.keyboard.press('ArrowDown') +- +- # Handle the wait_for parameter +- wait_for = kwargs.get("wait_for") +- if wait_for: +- try: +- await self.smart_wait(page, wait_for, timeout=kwargs.get("page_timeout", 60000)) +- except Exception as e: +- raise RuntimeError(f"Wait condition failed: {str(e)}") +- +- # if not wait_for and js_code: +- # await page.wait_for_load_state('networkidle', timeout=5000) +- +- # Update image dimensions +- if not self.text_only: +- update_image_dimensions_js = """ +- () => { +- return new Promise((resolve) => { +- const filterImage = (img) => { +- // Filter out images that are too small +- if (img.width < 100 && img.height < 100) return false; +- +- // Filter out images that are not visible +- const rect = img.getBoundingClientRect(); +- if (rect.width === 0 || rect.height === 0) return false; +- +- // Filter out images with certain class names (e.g., icons, thumbnails) +- if (img.classList.contains('icon') || img.classList.contains('thumbnail')) return false; +- +- // Filter out images with certain patterns in their src (e.g., placeholder images) +- if (img.src.includes('placeholder') || img.src.includes('icon')) return false; +- +- return true; +- }; +- +- const images = Array.from(document.querySelectorAll('img')).filter(filterImage); +- let imagesLeft = images.length; +- +- if (imagesLeft === 0) { +- resolve(); +- return; +- } +- +- const checkImage = (img) => { +- if (img.complete && img.naturalWidth !== 0) { +- img.setAttribute('width', img.naturalWidth); +- img.setAttribute('height', 
img.naturalHeight); +- imagesLeft--; +- if (imagesLeft === 0) resolve(); +- } +- }; +- +- images.forEach(img => { +- checkImage(img); +- if (!img.complete) { +- img.onload = () => { +- checkImage(img); +- }; +- img.onerror = () => { +- imagesLeft--; +- if (imagesLeft === 0) resolve(); +- }; +- } +- }); +- +- // Fallback timeout of 5 seconds +- // setTimeout(() => resolve(), 5000); +- resolve(); +- }); +- } +- """ +- +- try: +- try: +- await page.wait_for_load_state( +- # state="load", +- state="domcontentloaded", +- timeout=5 +- ) +- except PlaywrightTimeoutError: +- pass +- await page.evaluate(update_image_dimensions_js) +- except Exception as e: +- self.logger.error( +- message="Error updating image dimensions ACS-UPDATE_IMAGE_DIMENSIONS_JS: {error}", +- tag="ERROR", +- params={"error": str(e)} +- ) +- # raise RuntimeError(f"Error updating image dimensions ACS-UPDATE_IMAGE_DIMENSIONS_JS: {str(e)}") +- +- # Wait a bit for any onload events to complete +- # await page.wait_for_timeout(100) +- +- # Process iframes +- if kwargs.get("process_iframes", False): +- page = await self.process_iframes(page) +- +- await self.execute_hook('before_retrieve_html', page, context = context, **kwargs) +- # Check if delay_before_return_html is set then wait for that time +- delay_before_return_html = kwargs.get("delay_before_return_html", 0.1) +- if delay_before_return_html: +- await asyncio.sleep(delay_before_return_html) +- +- # Check for remove_overlay_elements parameter +- if kwargs.get("remove_overlay_elements", False): +- await self.remove_overlay_elements(page) +- +- html = await page.content() +- await self.execute_hook('before_return_html', page, html, context = context, **kwargs) +- +- # Check if kwargs has screenshot=True then take screenshot +- screenshot_data = None +- if kwargs.get("screenshot"): +- # Check we have screenshot_wait_for parameter, if we have simply wait for that time +- screenshot_wait_for = kwargs.get("screenshot_wait_for") +- if screenshot_wait_for: 
+- await asyncio.sleep(screenshot_wait_for) +- screenshot_data = await self.take_screenshot(page) +- +- # if self.verbose: +- # print(f"[LOG] ✅ Crawled {url} successfully!") +- +- if self.use_cached_html: +- cache_file_path = os.path.join( +- os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest() +- ) +- with open(cache_file_path, "w", encoding="utf-8") as f: +- f.write(html) +- # store response headers and status code in cache +- with open(cache_file_path + ".meta", "w", encoding="utf-8") as f: +- json.dump({ +- "response_headers": response_headers, +- "status_code": status_code +- }, f) +- +- async def get_delayed_content(delay: float = 5.0) -> str: +- if self.verbose: +- print(f"[LOG] Waiting for {delay} seconds before retrieving content for {url}") +- await asyncio.sleep(delay) +- return await page.content() +- +- response = AsyncCrawlResponse( +- html=html, +- response_headers=response_headers, +- status_code=status_code, +- screenshot=screenshot_data, +- get_delayed_content=get_delayed_content, +- downloaded_files=self._downloaded_files if self._downloaded_files else None +- ) +- return response +- except Error as e: +- raise Error(f"async_crawler_strategy.py:_crawleb(): {str(e)}") +- # finally: +- # if not session_id: +- # await page.close() +- # await context.close() +- +- async def _handle_download(self, download): +- """Handle file downloads.""" +- try: +- suggested_filename = download.suggested_filename +- download_path = os.path.join(self.downloads_path, suggested_filename) +- +- self.logger.info( +- message="Downloading {filename} to {path}", +- tag="FETCH", +- params={"filename": suggested_filename, "path": download_path} +- ) +- +- start_time = time.perf_counter() +- await download.save_as(download_path) +- end_time = time.perf_counter() +- self._downloaded_files.append(download_path) +- +- self.logger.success( +- message="Downloaded {filename} successfully", +- tag="COMPLETE", +- 
params={"filename": suggested_filename, "path": download_path, "duration": f"{end_time - start_time:.2f}s"} +- ) +- except Exception as e: +- self.logger.error( +- message="Failed to handle download: {error}", +- tag="ERROR", +- params={"error": str(e)} +- ) +- +- # if self.verbose: +- # print(f"[ERROR] Failed to handle download: {str(e)}") +- +- async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]: +- semaphore_count = kwargs.get('semaphore_count', 5) # Adjust as needed +- semaphore = asyncio.Semaphore(semaphore_count) +- +- async def crawl_with_semaphore(url): +- async with semaphore: +- return await self.crawl(url, **kwargs) +- +- tasks = [crawl_with_semaphore(url) for url in urls] +- results = await asyncio.gather(*tasks, return_exceptions=True) +- return [result if not isinstance(result, Exception) else str(result) for result in results] +- +- async def remove_overlay_elements(self, page: Page) -> None: +- """ +- Removes popup overlays, modals, cookie notices, and other intrusive elements from the page. 
+- +- Args: +- page (Page): The Playwright page instance +- """ +- remove_overlays_js = """ +- async () => { +- // Function to check if element is visible +- const isVisible = (elem) => { +- const style = window.getComputedStyle(elem); +- return style.display !== 'none' && +- style.visibility !== 'hidden' && +- style.opacity !== '0'; +- }; +- +- // Common selectors for popups and overlays +- const commonSelectors = [ +- // Close buttons first +- 'button[class*="close" i]', 'button[class*="dismiss" i]', +- 'button[aria-label*="close" i]', 'button[title*="close" i]', +- 'a[class*="close" i]', 'span[class*="close" i]', +- +- // Cookie notices +- '[class*="cookie-banner" i]', '[id*="cookie-banner" i]', +- '[class*="cookie-consent" i]', '[id*="cookie-consent" i]', +- +- // Newsletter/subscription dialogs +- '[class*="newsletter" i]', '[class*="subscribe" i]', +- +- // Generic popups/modals +- '[class*="popup" i]', '[class*="modal" i]', +- '[class*="overlay" i]', '[class*="dialog" i]', +- '[role="dialog"]', '[role="alertdialog"]' +- ]; +- +- // Try to click close buttons first +- for (const selector of commonSelectors.slice(0, 6)) { +- const closeButtons = document.querySelectorAll(selector); +- for (const button of closeButtons) { +- if (isVisible(button)) { +- try { +- button.click(); +- await new Promise(resolve => setTimeout(resolve, 100)); +- } catch (e) { +- console.log('Error clicking button:', e); +- } +- } +- } +- } +- +- // Remove remaining overlay elements +- const removeOverlays = () => { +- // Find elements with high z-index +- const allElements = document.querySelectorAll('*'); +- for (const elem of allElements) { +- const style = window.getComputedStyle(elem); +- const zIndex = parseInt(style.zIndex); +- const position = style.position; +- +- if ( +- isVisible(elem) && +- (zIndex > 999 || position === 'fixed' || position === 'absolute') && +- ( +- elem.offsetWidth > window.innerWidth * 0.5 || +- elem.offsetHeight > window.innerHeight * 0.5 || +- 
style.backgroundColor.includes('rgba') || +- parseFloat(style.opacity) < 1 +- ) +- ) { +- elem.remove(); +- } +- } +- +- // Remove elements matching common selectors +- for (const selector of commonSelectors) { +- const elements = document.querySelectorAll(selector); +- elements.forEach(elem => { +- if (isVisible(elem)) { +- elem.remove(); +- } +- }); +- } +- }; +- +- // Remove overlay elements +- removeOverlays(); +- +- // Remove any fixed/sticky position elements at the top/bottom +- const removeFixedElements = () => { +- const elements = document.querySelectorAll('*'); +- elements.forEach(elem => { +- const style = window.getComputedStyle(elem); +- if ( +- (style.position === 'fixed' || style.position === 'sticky') && +- isVisible(elem) +- ) { +- elem.remove(); +- } +- }); +- }; +- +- removeFixedElements(); +- +- // Remove empty block elements as: div, p, span, etc. +- const removeEmptyBlockElements = () => { +- const blockElements = document.querySelectorAll('div, p, span, section, article, header, footer, aside, nav, main, ul, ol, li, dl, dt, dd, h1, h2, h3, h4, h5, h6'); +- blockElements.forEach(elem => { +- if (elem.innerText.trim() === '') { +- elem.remove(); +- } +- }); +- }; +- +- // Remove margin-right and padding-right from body (often added by modal scripts) +- document.body.style.marginRight = '0px'; +- document.body.style.paddingRight = '0px'; +- document.body.style.overflow = 'auto'; +- +- // Wait a bit for any animations to complete +- await new Promise(resolve => setTimeout(resolve, 100)); +- } +- """ +- +- try: +- await page.evaluate(remove_overlays_js) +- await page.wait_for_timeout(500) # Wait for any animations to complete +- except Exception as e: +- self.logger.warning( +- message="Failed to remove overlay elements: {error}", +- tag="SCRAPE", +- params={"error": str(e)} +- ) +- # if self.verbose: +- # print(f"Warning: Failed to remove overlay elements: {str(e)}") +- +- async def take_screenshot(self, page: Page) -> str: +- """ +- Takes a 
screenshot of the current page. +- +- Args: +- page (Page): The Playwright page instance +- +- Returns: +- str: Base64-encoded screenshot image +- """ +- try: +- # The page is already loaded, just take the screenshot +- screenshot = await page.screenshot(full_page=True) +- return base64.b64encode(screenshot).decode('utf-8') +- except Exception as e: +- error_message = f"Failed to take screenshot: {str(e)}" +- self.logger.error( +- message="Screenshot failed: {error}", +- tag="ERROR", +- params={"error": error_message} +- ) +- +- +- # Generate an error image +- img = Image.new('RGB', (800, 600), color='black') +- draw = ImageDraw.Draw(img) +- font = ImageFont.load_default() +- draw.text((10, 10), error_message, fill=(255, 255, 255), font=font) +- +- buffered = BytesIO() +- img.save(buffered, format="JPEG") +- return base64.b64encode(buffered.getvalue()).decode('utf-8') +- finally: +- await page.close() +- +- async def export_storage_state(self, path: str = None) -> dict: +- """ +- Exports the current storage state (cookies, localStorage, sessionStorage) +- to a JSON file at the specified path. +- """ +- if self.default_context: +- state = await self.default_context.storage_state(path=path) +- self.logger.info( +- message="Exported storage state to {path}", +- tag="INFO", +- params={"path": path} +- ) +- return state +- else: +- self.logger.warning( +- message="No default_context available to export storage state.", +- tag="WARNING" +- ) +- +- async def _generate_screenshot_from_html(self, html: str) -> Optional[str]: +- """ +- Generates a screenshot from raw HTML content. +- +- Args: +- html (str): The HTML content to render and capture. +- +- Returns: +- Optional[str]: Base64-encoded screenshot image or an error image if failed. 
+- """ +- try: +- if not self.browser: +- await self.start() +- page = await self.browser.new_page() +- await page.set_content(html, wait_until='networkidle') +- screenshot = await page.screenshot(full_page=True) +- await page.close() +- return base64.b64encode(screenshot).decode('utf-8') +- except Exception as e: +- error_message = f"Failed to take screenshot: {str(e)}" +- # print(error_message) +- self.logger.error( +- message="Screenshot failed: {error}", +- tag="ERROR", +- params={"error": error_message} +- ) +- +- # Generate an error image +- img = Image.new('RGB', (800, 600), color='black') +- draw = ImageDraw.Draw(img) +- font = ImageFont.load_default() +- draw.text((10, 10), error_message, fill=(255, 255, 255), font=font) +- +- buffered = BytesIO() +- img.save(buffered, format="JPEG") +- return base64.b64encode(buffered.getvalue()).decode('utf-8') +- +diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py +index 553e9df..3f040e1 100644 +--- a/crawl4ai/async_crawler_strategy.py ++++ b/crawl4ai/async_crawler_strategy.py +@@ -17,9 +17,10 @@ import json + import uuid + from .js_snippet import load_js_script + from .models import AsyncCrawlResponse +-from .utils import create_box_message ++from .utils import get_error_context + from .user_agent_generator import UserAgentGenerator +-from .config import SCREENSHOT_HEIGHT_TRESHOLD ++from .config import SCREENSHOT_HEIGHT_TRESHOLD, DOWNLOAD_PAGE_TIMEOUT ++from .async_configs import BrowserConfig, CrawlerRunConfig + from playwright_stealth import StealthConfig, stealth_async + + +@@ -64,7 +65,6 @@ BROWSER_DISABLE_OPTIONS = [ + "--use-mock-keychain" + ] + +- + class ManagedBrowser: + def __init__(self, browser_type: str = "chromium", user_data_dir: Optional[str] = None, headless: bool = False, logger = None, host: str = "localhost", debugging_port: int = 9222): + self.browser_type = browser_type +@@ -225,50 +225,44 @@ class ManagedBrowser: + params={"error": str(e)} + ) + +- + class 
BrowserManager: +- def __init__(self, use_managed_browser: bool, user_data_dir: Optional[str], headless: bool, logger, browser_type: str, proxy, proxy_config, chrome_channel: str, viewport_width: int, viewport_height: int, accept_downloads: bool, storage_state, ignore_https_errors: bool, java_script_enabled: bool, cookies: List[dict], headers: dict, extra_args: List[str], text_only: bool, light_mode: bool, user_agent: str, browser_hint: str, downloads_path: Optional[str]): +- self.use_managed_browser = use_managed_browser +- self.user_data_dir = user_data_dir +- self.headless = headless ++ def __init__(self, browser_config: BrowserConfig, logger=None): ++ """ ++ Initialize the BrowserManager with a browser configuration. ++ ++ Args: ++ browser_config (BrowserConfig): Configuration object containing all browser settings ++ logger: Logger instance for recording events and errors ++ """ ++ self.config = browser_config + self.logger = logger +- self.browser_type = browser_type +- self.proxy = proxy +- self.proxy_config = proxy_config +- self.chrome_channel = chrome_channel +- self.viewport_width = viewport_width +- self.viewport_height = viewport_height +- self.accept_downloads = accept_downloads +- self.storage_state = storage_state +- self.ignore_https_errors = ignore_https_errors +- self.java_script_enabled = java_script_enabled +- self.cookies = cookies or [] +- self.headers = headers or {} +- self.extra_args = extra_args or [] +- self.text_only = text_only +- self.light_mode = light_mode ++ ++ # Browser state + self.browser = None +- self.default_context : BrowserContext = None ++ self.default_context = None + self.managed_browser = None +- self.sessions = {} +- self.session_ttl = 1800 + self.playwright = None +- self.user_agent = user_agent +- self.browser_hint = browser_hint +- self.downloads_path = downloads_path ++ ++ # Session management ++ self.sessions = {} ++ self.session_ttl = 1800 # 30 minutes ++ ++ # Initialize ManagedBrowser if needed ++ if 
self.config.use_managed_browser: ++ self.managed_browser = ManagedBrowser( ++ browser_type=self.config.browser_type, ++ user_data_dir=self.config.user_data_dir, ++ headless=self.config.headless, ++ logger=self.logger ++ ) + + async def start(self): ++ """Start the browser instance and set up the default context.""" + if self.playwright is None: + from playwright.async_api import async_playwright + self.playwright = await async_playwright().start() + +- if self.use_managed_browser: +- self.managed_browser = ManagedBrowser( +- browser_type=self.browser_type, +- user_data_dir=self.user_data_dir, +- headless=self.headless, +- logger=self.logger +- ) ++ if self.config.use_managed_browser: + cdp_url = await self.managed_browser.start() + self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url) + contexts = self.browser.contexts +@@ -276,142 +270,126 @@ class BrowserManager: + self.default_context = contexts[0] + else: + self.default_context = await self.browser.new_context( +- viewport={"width": self.viewport_width, "height": self.viewport_height}, +- storage_state=self.storage_state, +- user_agent=self.headers.get("User-Agent"), +- accept_downloads=self.accept_downloads, +- ignore_https_errors=self.ignore_https_errors, +- java_script_enabled=self.java_script_enabled ++ viewport={"width": self.config.viewport_width, "height": self.config.viewport_height}, ++ storage_state=self.config.storage_state, ++ user_agent=self.config.headers.get("User-Agent", self.config.user_agent), ++ accept_downloads=self.config.accept_downloads, ++ ignore_https_errors=self.config.ignore_https_errors, ++ java_script_enabled=self.config.java_script_enabled + ) + await self.setup_context(self.default_context) + else: +- browser_args = { +- "headless": self.headless, +- "args": [ +- "--no-sandbox", +- "--disable-dev-shm-usage", +- "--no-first-run", +- "--no-default-browser-check", +- "--disable-infobars", +- "--window-position=0,0", +- "--ignore-certificate-errors", +- 
"--ignore-certificate-errors-spki-list", +- "--disable-blink-features=AutomationControlled", +- "--window-position=400,0", +- f"--window-size={self.viewport_width},{self.viewport_height}", +- ] +- } +- +- if self.light_mode: +- browser_args["args"].extend(BROWSER_DISABLE_OPTIONS) ++ browser_args = self._build_browser_args() ++ ++ # Launch appropriate browser type ++ if self.config.browser_type == "firefox": ++ self.browser = await self.playwright.firefox.launch(**browser_args) ++ elif self.config.browser_type == "webkit": ++ self.browser = await self.playwright.webkit.launch(**browser_args) ++ else: ++ self.browser = await self.playwright.chromium.launch(**browser_args) + +- if self.text_only: +- browser_args["args"].extend(['--blink-settings=imagesEnabled=false','--disable-remote-fonts']) ++ self.default_context = self.browser + +- if self.chrome_channel: +- browser_args["channel"] = self.chrome_channel ++ def _build_browser_args(self) -> dict: ++ """Build browser launch arguments from config.""" ++ args = [ ++ "--no-sandbox", ++ "--disable-dev-shm-usage", ++ "--no-first-run", ++ "--no-default-browser-check", ++ "--disable-infobars", ++ "--window-position=0,0", ++ "--ignore-certificate-errors", ++ "--ignore-certificate-errors-spki-list", ++ "--disable-blink-features=AutomationControlled", ++ "--window-position=400,0", ++ f"--window-size={self.config.viewport_width},{self.config.viewport_height}", ++ ] + +- if self.extra_args: +- browser_args["args"].extend(self.extra_args) ++ if self.config.light_mode: ++ args.extend(BROWSER_DISABLE_OPTIONS) + +- if self.accept_downloads: +- browser_args["downloads_path"] = os.path.join(os.getcwd(), "downloads") +- os.makedirs(browser_args["downloads_path"], exist_ok=True) ++ if self.config.text_only: ++ args.extend(['--blink-settings=imagesEnabled=false', '--disable-remote-fonts']) + +- if self.proxy: +- from playwright.async_api import ProxySettings +- proxy_settings = ProxySettings(server=self.proxy) +- browser_args["proxy"] = 
proxy_settings +- elif self.proxy_config: +- from playwright.async_api import ProxySettings +- proxy_settings = ProxySettings( +- server=self.proxy_config.get("server"), +- username=self.proxy_config.get("username"), +- password=self.proxy_config.get("password") +- ) +- browser_args["proxy"] = proxy_settings ++ if self.config.extra_args: ++ args.extend(self.config.extra_args) + +- if self.browser_type == "firefox": +- self.browser = await self.playwright.firefox.launch(**browser_args) +- elif self.browser_type == "webkit": +- self.browser = await self.playwright.webkit.launch(**browser_args) +- else: +- self.browser = await self.playwright.chromium.launch(**browser_args) ++ browser_args = { ++ "headless": self.config.headless, ++ "args": args ++ } + +- self.default_context = self.browser +- # Since default_context in non-managed mode is the browser, no setup needed here. ++ if self.config.chrome_channel: ++ browser_args["channel"] = self.config.chrome_channel ++ ++ if self.config.accept_downloads: ++ browser_args["downloads_path"] = (self.config.downloads_path or ++ os.path.join(os.getcwd(), "downloads")) ++ os.makedirs(browser_args["downloads_path"], exist_ok=True) ++ ++ if self.config.proxy or self.config.proxy_config: ++ from playwright.async_api import ProxySettings ++ proxy_settings = ( ++ ProxySettings(server=self.config.proxy) if self.config.proxy else ++ ProxySettings( ++ server=self.config.proxy_config.get("server"), ++ username=self.config.proxy_config.get("username"), ++ password=self.config.proxy_config.get("password") ++ ) ++ ) ++ browser_args["proxy"] = proxy_settings + ++ return browser_args + +- async def setup_context(self, context : BrowserContext, is_default=False): +- # Set extra headers +- if self.headers: +- await context.set_extra_http_headers(self.headers) ++ async def setup_context(self, context: BrowserContext, is_default=False): ++ """Set up a browser context with the configured options.""" ++ if self.config.headers: ++ await 
context.set_extra_http_headers(self.config.headers) + +- # Add cookies if any +- if self.cookies: +- await context.add_cookies(self.cookies) ++ if self.config.cookies: ++ await context.add_cookies(self.config.cookies) + +- # Ensure storage_state if provided +- if self.storage_state: +- # If storage_state is a dictionary or file path, Playwright will handle it. ++ if self.config.storage_state: + await context.storage_state(path=None) + +- # If accept_downloads, set timeouts and ensure properties +- if self.accept_downloads: +- await context.set_default_timeout(60000) +- await context.set_default_navigation_timeout(60000) +- if self.downloads_path: ++ if self.config.accept_downloads: ++ context.set_default_timeout(DOWNLOAD_PAGE_TIMEOUT) ++ context.set_default_navigation_timeout(DOWNLOAD_PAGE_TIMEOUT) ++ if self.config.downloads_path: + context._impl_obj._options["accept_downloads"] = True +- context._impl_obj._options["downloads_path"] = self.downloads_path ++ context._impl_obj._options["downloads_path"] = self.config.downloads_path + +- # If we have a user_agent, override it along with sec-ch-ua +- if self.user_agent: +- # Merge headers if needed +- combined_headers = {"User-Agent": self.user_agent, "sec-ch-ua": self.browser_hint} +- combined_headers.update(self.headers) ++ # Handle user agent and browser hints ++ if self.config.user_agent: ++ combined_headers = { ++ "User-Agent": self.config.user_agent, ++ "sec-ch-ua": self.config.browser_hint ++ } ++ combined_headers.update(self.config.headers) + await context.set_extra_http_headers(combined_headers) +- +- async def close(self): +- # Close all active sessions +- session_ids = list(self.sessions.keys()) +- for session_id in session_ids: +- await self.kill_session(session_id) +- +- if self.browser: +- await self.browser.close() +- self.browser = None +- +- if self.managed_browser: +- await asyncio.sleep(0.5) +- await self.managed_browser.cleanup() +- self.managed_browser = None +- +- if self.playwright: +- await 
self.playwright.stop() +- self.playwright = None + + async def get_page(self, session_id: Optional[str], user_agent: str): +- # Cleanup expired sessions ++ """Get a page for the given session ID, creating a new one if needed.""" + self._cleanup_expired_sessions() + +- if session_id: +- context, page, _ = self.sessions.get(session_id, (None, None, None)) +- if context and page: +- self.sessions[session_id] = (context, page, time.time()) +- return page, context ++ if session_id and session_id in self.sessions: ++ context, page, _ = self.sessions[session_id] ++ self.sessions[session_id] = (context, page, time.time()) ++ return page, context + +- # Create a new context/page pair +- if self.use_managed_browser: ++ if self.config.use_managed_browser: + context = self.default_context + page = await context.new_page() + else: + context = await self.browser.new_context( + user_agent=user_agent, +- viewport={"width": self.viewport_width, "height": self.viewport_height}, +- proxy={"server": self.proxy} if self.proxy else None, +- accept_downloads=self.accept_downloads, +- storage_state=self.storage_state, +- ignore_https_errors=self.ignore_https_errors ++ viewport={"width": self.config.viewport_width, "height": self.config.viewport_height}, ++ proxy={"server": self.config.proxy} if self.config.proxy else None, ++ accept_downloads=self.config.accept_downloads, ++ storage_state=self.config.storage_state, ++ ignore_https_errors=self.config.ignore_https_errors + ) + await self.setup_context(context) + page = await context.new_page() +@@ -422,14 +400,16 @@ class BrowserManager: + return page, context + + async def kill_session(self, session_id: str): ++ """Kill a browser session and clean up resources.""" + if session_id in self.sessions: + context, page, _ = self.sessions[session_id] + await page.close() +- if not self.use_managed_browser: ++ if not self.config.use_managed_browser: + await context.close() + del self.sessions[session_id] + + def _cleanup_expired_sessions(self): ++ 
"""Clean up expired sessions based on TTL.""" + current_time = time.time() + expired_sessions = [ + sid for sid, (_, _, last_used) in self.sessions.items() +@@ -438,6 +418,28 @@ class BrowserManager: + for sid in expired_sessions: + asyncio.create_task(self.kill_session(sid)) + ++ async def close(self): ++ """Close all browser resources and clean up.""" ++ if self.config.sleep_on_close: ++ await asyncio.sleep(0.5) ++ ++ session_ids = list(self.sessions.keys()) ++ for session_id in session_ids: ++ await self.kill_session(session_id) ++ ++ if self.browser: ++ await self.browser.close() ++ self.browser = None ++ ++ if self.managed_browser: ++ await asyncio.sleep(0.5) ++ await self.managed_browser.cleanup() ++ self.managed_browser = None ++ ++ if self.playwright: ++ await self.playwright.stop() ++ self.playwright = None ++ + class AsyncCrawlerStrategy(ABC): + @abstractmethod + async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse: +@@ -460,60 +462,24 @@ class AsyncCrawlerStrategy(ABC): + pass + + class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): +- def __init__(self, use_cached_html=False, js_code=None, logger = None, **kwargs): +- self.text_only = kwargs.get("text_only", False) +- self.light_mode = kwargs.get("light_mode", False) ++ def __init__(self, browser_config: BrowserConfig = None, logger = None, **kwargs): ++ """ ++ Initialize the AsyncPlaywrightCrawlerStrategy with a browser configuration. ++ ++ Args: ++ browser_config (BrowserConfig): Configuration object containing browser settings. ++ If None, will be created from kwargs for backwards compatibility. ++ logger: Logger instance for recording events and errors. ++ **kwargs: Additional arguments for backwards compatibility and extending functionality. 
++ """ ++ # Initialize browser config, either from provided object or kwargs ++ self.browser_config = browser_config or BrowserConfig.from_kwargs(kwargs) + self.logger = logger +- self.use_cached_html = use_cached_html +- self.viewport_width = kwargs.get("viewport_width", 800 if self.text_only else 1920) +- self.viewport_height = kwargs.get("viewport_height", 600 if self.text_only else 1080) + +- if self.text_only: +- self.extra_args = kwargs.get("extra_args", []) + [ +- '--disable-images', +- '--disable-javascript', +- '--disable-gpu', +- '--disable-software-rasterizer', +- '--disable-dev-shm-usage' +- ] +- +- self.user_agent = kwargs.get( +- "user_agent", +- "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.5845.187 Safari/604.1 Edg/117.0.2045.47" +- # "Mozilla/5.0 (Linux; Android 11; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.120 Mobile Safari/537.36" +- ) +- user_agenr_generator = UserAgentGenerator() +- if kwargs.get("user_agent_mode") == "random": +- self.user_agent = user_agenr_generator.generate( +- **kwargs.get("user_agent_generator_config", {}) +- ) +- self.pdf = kwargs.get("pdf", False) # New flag +- self.screenshot_requested = kwargs.get('screenshot', False) ++ # Initialize session management ++ self._downloaded_files = [] + +- self.proxy = kwargs.get("proxy") +- self.proxy_config = kwargs.get("proxy_config") +- self.headless = kwargs.get("headless", True) +- self.browser_type = kwargs.get("browser_type", "chromium") +- self.headers = kwargs.get("headers", {}) +- self.browser_hint = user_agenr_generator.generate_client_hints(self.user_agent) +- self.headers.setdefault("sec-ch-ua", self.browser_hint) +- self.cookies = kwargs.get("cookies", []) +- self.storage_state = kwargs.get("storage_state", None) +- self.sessions = {} +- self.session_ttl = 1800 +- self.js_code = js_code +- self.verbose = kwargs.get("verbose", False) +- self.playwright = None +- self.browser = None +- 
self.sleep_on_close = kwargs.get("sleep_on_close", False) +- self.use_managed_browser = kwargs.get("use_managed_browser", False) +- self.user_data_dir = kwargs.get("user_data_dir", None) +- self.use_persistent_context = kwargs.get("use_persistent_context", False) +- if self.use_persistent_context: +- self.use_managed_browser = True +- self.chrome_channel = kwargs.get("chrome_channel", "chrome") +- self.managed_browser = None +- self.default_context = None ++ # Initialize hooks system + self.hooks = { + 'on_browser_created': None, + 'on_user_agent_updated': None, +@@ -523,40 +489,12 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): + 'before_return_html': None, + 'before_retrieve_html': None + } +- self.extra_args = kwargs.get("extra_args", []) +- self.ignore_https_errors = kwargs.get("ignore_https_errors", True) +- self.java_script_enabled = kwargs.get("java_script_enabled", True) +- self.accept_downloads = kwargs.get("accept_downloads", False) +- self.downloads_path = kwargs.get("downloads_path") +- self._downloaded_files = [] # Track downloaded files for current crawl +- if self.accept_downloads and not self.downloads_path: +- self.downloads_path = os.path.join(os.getcwd(), "downloads") +- os.makedirs(self.downloads_path, exist_ok=True) +- ++ ++ # Initialize browser manager with config + self.browser_manager = BrowserManager( +- use_managed_browser=self.use_managed_browser, +- user_data_dir=self.user_data_dir, +- headless=self.headless, +- logger=self.logger, +- browser_type=self.browser_type, +- proxy=self.proxy, +- proxy_config=self.proxy_config, +- chrome_channel=self.chrome_channel, +- viewport_width=self.viewport_width, +- viewport_height=self.viewport_height, +- accept_downloads=self.accept_downloads, +- storage_state=self.storage_state, +- ignore_https_errors=self.ignore_https_errors, +- java_script_enabled=self.java_script_enabled, +- cookies=self.cookies, +- headers=self.headers, +- extra_args=self.extra_args, +- text_only=self.text_only, +- 
light_mode=self.light_mode, +- user_agent=self.user_agent, +- browser_hint=self.browser_hint, +- downloads_path=self.downloads_path +- ) ++ browser_config=self.browser_config, ++ logger=self.logger ++ ) + + async def __aenter__(self): + await self.start() +@@ -570,15 +508,15 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): + await self.execute_hook('on_browser_created', self.browser_manager.browser, context = self.browser_manager.default_context) + + async def close(self): +- if self.sleep_on_close: +- await asyncio.sleep(0.5) +- + await self.browser_manager.close() +- +- # Issue #256: Remove __del__ method to avoid potential issues with async cleanup +- # def __del__(self): +- # if self.browser or self.playwright: +- # asyncio.get_event_loop().run_until_complete(self.close()) ++ ++ async def kill_session(self, session_id: str): ++ # Log a warning message and no need kill session, in new version auto kill session ++ self.logger.warning( ++ message="Session auto-kill is enabled in the new version. 
No need to manually kill sessions.", ++ tag="WARNING" ++ ) ++ await self.browser_manager.kill_session(session_id) + + def set_hook(self, hook_type: str, hook: Callable): + if hook_type in self.hooks: +@@ -600,23 +538,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): + + def set_custom_headers(self, headers: Dict[str, str]): + self.headers = headers +- +- async def kill_session(self, session_id: str): +- if session_id in self.sessions: +- context, page, _ = self.sessions[session_id] +- await page.close() +- if not self.use_managed_browser: +- await context.close() +- del self.sessions[session_id] +- +- def _cleanup_expired_sessions(self): +- current_time = time.time() +- expired_sessions = [ +- sid for sid, (_, _, last_used) in self.sessions.items() +- if current_time - last_used > self.session_ttl +- ] +- for sid in expired_sessions: +- asyncio.create_task(self.kill_session(sid)) + + async def smart_wait(self, page: Page, wait_for: str, timeout: float = 30000): + wait_for = wait_for.strip() +@@ -715,7 +636,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): + }} + """) + else: +- # print(f"Warning: Could not access content frame for iframe {i}") + self.logger.warning( + message="Could not access content frame for iframe {index}", + tag="SCRAPE", +@@ -727,7 +647,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): + tag="ERROR", + params={"index": i, "error": str(e)} + ) +- # print(f"Error processing iframe {i}: {str(e)}") + + # Return the page object + return page +@@ -743,7 +662,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): + page, context = await self.browser_manager.get_page(session_id, user_agent) + return session_id + +- async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse: ++ async def crawl(self, url: str, config: CrawlerRunConfig, **kwargs) -> AsyncCrawlResponse: + """ + Crawls a given URL or processes raw HTML/local file content based on the URL prefix. 
+ +@@ -759,15 +678,13 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): + Returns: + AsyncCrawlResponse: The response containing HTML, headers, status code, and optional screenshot. + """ ++ config = config or CrawlerRunConfig.from_kwargs(kwargs) + response_headers = {} +- status_code = 200 # Default to 200 for local/raw HTML +- screenshot_requested = kwargs.get("screenshot", self.screenshot_requested) +- pdf_requested = kwargs.get("pdf", self.pdf) ++ status_code = 200 # Default for local/raw HTML + screenshot_data = None + + if url.startswith(('http://', 'https://')): +- # Proceed with standard web crawling +- return await self._crawl_web(url, **kwargs) ++ return await self._crawl_web(url, config) + + elif url.startswith('file://'): + # Process local file +@@ -776,7 +693,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): + raise FileNotFoundError(f"Local file not found: {local_file_path}") + with open(local_file_path, 'r', encoding='utf-8') as f: + html = f.read() +- if screenshot_requested: ++ if config.screenshot: + screenshot_data = await self._generate_screenshot_from_html(html) + return AsyncCrawlResponse( + html=html, +@@ -790,7 +707,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): + # Process raw HTML content + raw_html = url[4:] # Remove 'raw:' prefix + html = raw_html +- if screenshot_requested: ++ if config.screenshot: + screenshot_data = await self._generate_screenshot_from_html(html) + return AsyncCrawlResponse( + html=html, +@@ -802,92 +719,85 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): + else: + raise ValueError("URL must start with 'http://', 'https://', 'file://', or 'raw:'") + +- async def _crawl_web(self, url: str, **kwargs) -> AsyncCrawlResponse: ++ async def _crawl_web(self, url: str, config: CrawlerRunConfig) -> AsyncCrawlResponse: ++ """ ++ Internal method to crawl web URLs with the specified configuration. 
++ ++ Args: ++ url (str): The web URL to crawl ++ config (CrawlerRunConfig): Configuration object controlling the crawl behavior ++ ++ Returns: ++ AsyncCrawlResponse: The response containing HTML, headers, status code, and optional data ++ """ + response_headers = {} + status_code = None + +- screenshot_requested = kwargs.get("screenshot", self.screenshot_requested) +- pdf_requested = kwargs.get("pdf", self.pdf) +- + # Reset downloaded files list for new crawl + self._downloaded_files = [] + +- self._cleanup_expired_sessions() +- session_id = kwargs.get("session_id") +- +- # Check if in kwargs we have user_agent that will override the default user_agent +- user_agent = kwargs.get("user_agent", self.user_agent) +- +- # Generate random user agent if magic mode is enabled and user_agent_mode is not random +- if kwargs.get("user_agent_mode") != "random" and kwargs.get("magic", False): ++ # Handle user agent with magic mode ++ user_agent = self.browser_config.user_agent ++ if config.magic and self.browser_config.user_agent_mode != "random": + user_agent = UserAgentGenerator().generate( +- **kwargs.get("user_agent_generator_config", {}) ++ **(self.browser_config.user_agent_generator_config or {}) + ) + +- # Handle page creation differently for managed browser +- page, context = await self.browser_manager.get_page(session_id, user_agent) ++ # Get page for session ++ page, context = await self.browser_manager.get_page( ++ session_id=config.session_id, ++ user_agent=user_agent ++ ) ++ ++ # Add default cookie + await context.add_cookies([{"name": "cookiesEnabled", "value": "true", "url": url}]) + +- if kwargs.get("override_navigator", False) or kwargs.get("simulate_user", False) or kwargs.get("magic", False): +- # Inject scripts to override navigator properties ++ # Handle navigator overrides ++ if config.override_navigator or config.simulate_user or config.magic: + await context.add_init_script(load_js_script("navigator_overrider")) + +- # Add console message and error 
logging +- if kwargs.get("log_console", False): +- page.on("console", lambda msg: print(f"Console: {msg.text}")) +- page.on("pageerror", lambda exc: print(f"Page Error: {exc}")) ++ # Set up console logging if requested ++ if config.log_console: ++ page.on("console", lambda msg: self.logger.debug( ++ message="Console: {msg}", ++ tag="CONSOLE", ++ params={"msg": msg.text} ++ )) ++ page.on("pageerror", lambda exc: self.logger.error( ++ message="Page error: {exc}", ++ tag="ERROR", ++ params={"exc": exc} ++ )) + + try: +- # Set up download handling if enabled +- if self.accept_downloads: ++ # Set up download handling ++ if self.browser_config.accept_downloads: + page.on("download", lambda download: asyncio.create_task(self._handle_download(download))) + +- if self.use_cached_html: +- cache_file_path = os.path.join( +- os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest() +- ) +- if os.path.exists(cache_file_path): +- html = "" +- with open(cache_file_path, "r") as f: +- html = f.read() +- # retrieve response headers and status code from cache +- with open(cache_file_path + ".meta", "r") as f: +- meta = json.load(f) +- response_headers = meta.get("response_headers", {}) +- status_code = meta.get("status_code") +- response = AsyncCrawlResponse( +- html=html, response_headers=response_headers, status_code=status_code +- ) +- return response +- +- if not kwargs.get("js_only", False): +- await self.execute_hook('before_goto', page, context = context, **kwargs) ++ # Handle page navigation and content loading ++ if not config.js_only: ++ await self.execute_hook('before_goto', page, context=context) + + try: + response = await page.goto( + url, +- # wait_until=kwargs.get("wait_until", ["domcontentloaded", "networkidle"]), +- wait_until=kwargs.get("wait_until", "domcontentloaded"), +- timeout=kwargs.get("page_timeout", 60000), ++ wait_until=config.wait_until, ++ timeout=config.page_timeout + ) + except Error as e: +- 
raise RuntimeError(f"Failed on navigating ACS-GOTO :\n{str(e)}") ++ raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}") + +- await self.execute_hook('after_goto', page, context = context, **kwargs) ++ await self.execute_hook('after_goto', page, context=context) + +- # Get status code and headers + status_code = response.status + response_headers = response.headers + else: + status_code = 200 + response_headers = {} + +- # Replace the current wait_for_selector line with this more robust check: ++ # Wait for body element and visibility + try: +- # First wait for body to exist, regardless of visibility + await page.wait_for_selector('body', state='attached', timeout=30000) +- +- # Then wait for it to become visible by checking CSS + await page.wait_for_function(""" + () => { + const body = document.body; +@@ -897,9 +807,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): + style.opacity !== '0'; + } + """, timeout=30000) +- + except Error as e: +- # If waiting fails, let's try to diagnose the issue + visibility_info = await page.evaluate(""" + () => { + const body = document.body; +@@ -914,233 +822,195 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): + } + """) + +- if self.verbose: +- print(f"Body visibility debug info: {visibility_info}") ++ if self.config.verbose: ++ self.logger.debug( ++ message="Body visibility info: {info}", ++ tag="DEBUG", ++ params={"info": visibility_info} ++ ) + +- # Even if body is hidden, we might still want to proceed +- if kwargs.get('ignore_body_visibility', True): +- if self.verbose: +- print("Proceeding despite hidden body...") +- pass +- else: ++ if not config.ignore_body_visibility: + raise Error(f"Body element is hidden: {visibility_info}") +- +- # CONTENT LOADING ASSURANCE +- if not self.text_only and (kwargs.get("wait_for_images", True) or kwargs.get("adjust_viewport_to_content", False)): +- # Wait for network idle after initial load and images to load +- # await 
page.wait_for_load_state("networkidle") ++ ++ # Handle content loading and viewport adjustment ++ if not self.browser_config.text_only and (config.wait_for_images or config.adjust_viewport_to_content): + await page.wait_for_load_state("domcontentloaded") + await asyncio.sleep(0.1) +- from playwright.async_api import TimeoutError as PlaywrightTimeoutError + try: +- await page.wait_for_function("Array.from(document.images).every(img => img.complete)", timeout=1000) +- # Check for TimeoutError and ignore it ++ await page.wait_for_function( ++ "Array.from(document.images).every(img => img.complete)", ++ timeout=1000 ++ ) + except PlaywrightTimeoutError: + pass +- +- # After initial load, adjust viewport to content size +- if not self.text_only and kwargs.get("adjust_viewport_to_content", False): +- try: +- # Get actual page dimensions ++ ++ # Adjust viewport if needed ++ if not self.browser_config.text_only and config.adjust_viewport_to_content: ++ try: + page_width = await page.evaluate("document.documentElement.scrollWidth") + page_height = await page.evaluate("document.documentElement.scrollHeight") + +- target_width = self.viewport_width ++ target_width = self.browser_config.viewport_width + target_height = int(target_width * page_width / page_height * 0.95) + await page.set_viewport_size({"width": target_width, "height": target_height}) + +- # Compute scale factor +- # We want the entire page visible: the scale should make both width and height fit + scale = min(target_width / page_width, target_height / page_height) +- +- # Now we call CDP to set metrics. +- # We tell Chrome that the "device" is page_width x page_height in size, +- # but we scale it down so everything fits within the real viewport. 
+ cdp = await page.context.new_cdp_session(page) + await cdp.send('Emulation.setDeviceMetricsOverride', { +- 'width': page_width, # full page width +- 'height': page_height, # full page height +- 'deviceScaleFactor': 1, # keep normal DPR ++ 'width': page_width, ++ 'height': page_height, ++ 'deviceScaleFactor': 1, + 'mobile': False, +- 'scale': scale # scale the entire rendered content ++ 'scale': scale + }) +- + except Exception as e: + self.logger.warning( + message="Failed to adjust viewport to content: {error}", + tag="VIEWPORT", + params={"error": str(e)} +- ) +- +- # After viewport adjustment, handle page scanning if requested +- if kwargs.get("scan_full_page", False): +- try: +- viewport_height = page.viewport_size.get("height", self.viewport_height) +- current_position = viewport_height # Start with one viewport height +- scroll_delay = kwargs.get("scroll_delay", 0.2) +- +- # Initial scroll +- await page.evaluate(f"window.scrollTo(0, {current_position})") +- await asyncio.sleep(scroll_delay) +- +- # Get height after first scroll to account for any dynamic content +- total_height = await page.evaluate("document.documentElement.scrollHeight") +- +- while current_position < total_height: +- current_position = min(current_position + viewport_height, total_height) +- await page.evaluate(f"window.scrollTo(0, {current_position})") +- await asyncio.sleep(scroll_delay) +- +- # Check for dynamic content +- new_height = await page.evaluate("document.documentElement.scrollHeight") +- if new_height > total_height: +- total_height = new_height +- +- # Scroll back to top +- await page.evaluate("window.scrollTo(0, 0)") +- +- except Exception as e: +- self.logger.warning( +- message="Failed to perform full page scan: {error}", +- tag="PAGE_SCAN", +- params={"error": str(e)} + ) +- else: +- # Scroll to the bottom of the page +- await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") +- +- js_code = kwargs.get("js_code", kwargs.get("js", self.js_code)) +- if 
js_code: +- if isinstance(js_code, str): +- await page.evaluate(js_code) +- elif isinstance(js_code, list): +- for js in js_code: ++ ++ # Handle full page scanning ++ if config.scan_full_page: ++ await self._handle_full_page_scan(page, config.scroll_delay) ++ ++ # Execute JavaScript if provided ++ if config.js_code: ++ if isinstance(config.js_code, str): ++ await page.evaluate(config.js_code) ++ elif isinstance(config.js_code, list): ++ for js in config.js_code: + await page.evaluate(js) + +- # await page.wait_for_timeout(100) +- +- # Check for on execution event +- await self.execute_hook('on_execution_started', page, context = context, **kwargs) +- +- if kwargs.get("simulate_user", False) or kwargs.get("magic", False): +- # Simulate user interactions ++ await self.execute_hook('on_execution_started', page, context=context) ++ ++ # Handle user simulation ++ if config.simulate_user or config.magic: + await page.mouse.move(100, 100) + await page.mouse.down() + await page.mouse.up() + await page.keyboard.press('ArrowDown') + +- # Handle the wait_for parameter +- wait_for = kwargs.get("wait_for") +- if wait_for: ++ # Handle wait_for condition ++ if config.wait_for: + try: +- await self.smart_wait(page, wait_for, timeout=kwargs.get("page_timeout", 60000)) ++ await self.smart_wait(page, config.wait_for, timeout=config.page_timeout) + except Exception as e: + raise RuntimeError(f"Wait condition failed: {str(e)}") +- +- # if not wait_for and js_code: +- # await page.wait_for_load_state('networkidle', timeout=5000) + +- # Update image dimensions +- if not self.text_only: ++ # Update image dimensions if needed ++ if not self.browser_config.text_only: + update_image_dimensions_js = load_js_script("update_image_dimensions") +- + try: + try: +- await page.wait_for_load_state( +- # state="load", +- state="domcontentloaded", +- timeout=5 +- ) ++ await page.wait_for_load_state("domcontentloaded", timeout=5) + except PlaywrightTimeoutError: + pass + await 
page.evaluate(update_image_dimensions_js) + except Exception as e: + self.logger.error( +- message="Error updating image dimensions ACS-UPDATE_IMAGE_DIMENSIONS_JS: {error}", ++ message="Error updating image dimensions: {error}", + tag="ERROR", + params={"error": str(e)} + ) +- # raise RuntimeError(f"Error updating image dimensions ACS-UPDATE_IMAGE_DIMENSIONS_JS: {str(e)}") + +- # Wait a bit for any onload events to complete +- # await page.wait_for_timeout(100) +- +- # Process iframes +- if kwargs.get("process_iframes", False): ++ # Process iframes if needed ++ if config.process_iframes: + page = await self.process_iframes(page) +- +- await self.execute_hook('before_retrieve_html', page, context = context, **kwargs) +- # Check if delay_before_return_html is set then wait for that time +- delay_before_return_html = kwargs.get("delay_before_return_html", 0.1) +- if delay_before_return_html: +- await asyncio.sleep(delay_before_return_html) +- +- # Check for remove_overlay_elements parameter +- if kwargs.get("remove_overlay_elements", False): ++ ++ # Pre-content retrieval hooks and delay ++ await self.execute_hook('before_retrieve_html', page, context=context) ++ if config.delay_before_return_html: ++ await asyncio.sleep(config.delay_before_return_html) ++ ++ # Handle overlay removal ++ if config.remove_overlay_elements: + await self.remove_overlay_elements(page) +- ++ ++ # Get final HTML content + html = await page.content() +- await self.execute_hook('before_return_html', page, html, context = context, **kwargs) +- ++ await self.execute_hook('before_return_html', page, html, context=context) ++ ++ # Handle PDF and screenshot generation + start_export_time = time.perf_counter() + pdf_data = None +- if pdf_requested: +- # Generate PDF once +- pdf_data = await self.export_pdf(page) +- +- # Check if kwargs has screenshot=True then take screenshot + screenshot_data = None +- if screenshot_requested: #kwargs.get("screenshot"): +- # Check we have screenshot_wait_for 
parameter, if we have simply wait for that time +- screenshot_wait_for = kwargs.get("screenshot_wait_for") +- if screenshot_wait_for: +- await asyncio.sleep(screenshot_wait_for) +- +- screenshot_data = await self.take_screenshot(page, **kwargs) +- end_export_time = time.perf_counter() ++ ++ if config.pdf: ++ pdf_data = await self.export_pdf(page) ++ ++ if config.screenshot: ++ if config.screenshot_wait_for: ++ await asyncio.sleep(config.screenshot_wait_for) ++ screenshot_data = await self.take_screenshot( ++ page, ++ screenshot_height_threshold=config.screenshot_height_threshold ++ ) ++ + if screenshot_data or pdf_data: + self.logger.info( + message="Exporting PDF and taking screenshot took {duration:.2f}s", + tag="EXPORT", +- params={"duration": end_export_time - start_export_time} ++ params={"duration": time.perf_counter() - start_export_time} + ) +- +- if self.use_cached_html: +- cache_file_path = os.path.join( +- os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest() +- ) +- with open(cache_file_path, "w", encoding="utf-8") as f: +- f.write(html) +- # store response headers and status code in cache +- with open(cache_file_path + ".meta", "w", encoding="utf-8") as f: +- json.dump({ +- "response_headers": response_headers, +- "status_code": status_code +- }, f) + ++ # Define delayed content getter + async def get_delayed_content(delay: float = 5.0) -> str: +- if self.verbose: +- print(f"[LOG] Waiting for {delay} seconds before retrieving content for {url}") ++ if self.config.verbose: ++ self.logger.info( ++ message="Waiting for {delay} seconds before retrieving content for {url}", ++ tag="INFO", ++ params={"delay": delay, "url": url} ++ ) + await asyncio.sleep(delay) + return await page.content() +- +- response = AsyncCrawlResponse( +- html=html, +- response_headers=response_headers, ++ ++ # Return complete response ++ return AsyncCrawlResponse( ++ html=html, ++ response_headers=response_headers, + 
status_code=status_code, + screenshot=screenshot_data, + pdf_data=pdf_data, + get_delayed_content=get_delayed_content, + downloaded_files=self._downloaded_files if self._downloaded_files else None + ) +- return response +- except Error as e: +- raise Error(f"async_crawler_strategy.py:_crawleb(): {str(e)}") +- # finally: +- # if not session_id: +- # await page.close() +- # await context.close() + ++ except Exception as e: ++ raise e ++ ++ async def _handle_full_page_scan(self, page: Page, scroll_delay: float): ++ """Helper method to handle full page scanning""" ++ try: ++ viewport_height = page.viewport_size.get("height", self.browser_config.viewport_height) ++ current_position = viewport_height ++ ++ await page.evaluate(f"window.scrollTo(0, {current_position})") ++ await asyncio.sleep(scroll_delay) ++ ++ total_height = await page.evaluate("document.documentElement.scrollHeight") ++ ++ while current_position < total_height: ++ current_position = min(current_position + viewport_height, total_height) ++ await page.evaluate(f"window.scrollTo(0, {current_position})") ++ await asyncio.sleep(scroll_delay) ++ ++ new_height = await page.evaluate("document.documentElement.scrollHeight") ++ if new_height > total_height: ++ total_height = new_height ++ ++ await page.evaluate("window.scrollTo(0, 0)") ++ ++ except Exception as e: ++ self.logger.warning( ++ message="Failed to perform full page scan: {error}", ++ tag="PAGE_SCAN", ++ params={"error": str(e)} ++ ) ++ else: ++ await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") ++ ++ + async def _handle_download(self, download): + """Handle file downloads.""" + try: +@@ -1170,8 +1040,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): + params={"error": str(e)} + ) + +- # if self.verbose: +- # print(f"[ERROR] Failed to handle download: {str(e)}") + + async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]: + semaphore_count = kwargs.get('semaphore_count', 5) # Adjust as needed 
+@@ -1192,7 +1060,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): + Args: + page (Page): The Playwright page instance + """ +- remove_overlays_js = load_js_script("remove_overlays") ++ remove_overlays_js = load_js_script("remove_overlay_elements") + + try: + await page.evaluate(remove_overlays_js) +@@ -1203,8 +1071,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): + tag="SCRAPE", + params={"error": str(e)} + ) +- # if self.verbose: +- # print(f"Warning: Failed to remove overlay elements: {str(e)}") + + async def export_pdf(self, page: Page) -> bytes: + """ +@@ -1386,7 +1252,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): + return base64.b64encode(screenshot).decode('utf-8') + except Exception as e: + error_message = f"Failed to take screenshot: {str(e)}" +- # print(error_message) + self.logger.error( + message="Screenshot failed: {error}", + tag="ERROR", +diff --git a/crawl4ai/async_database.py b/crawl4ai/async_database.py +index 3c97e7d..5cdafac 100644 +--- a/crawl4ai/async_database.py ++++ b/crawl4ai/async_database.py +@@ -1,4 +1,4 @@ +-import os ++import os, sys + from pathlib import Path + import aiosqlite + import asyncio +@@ -13,6 +13,7 @@ import aiofiles + from .config import NEED_MIGRATION + from .version_manager import VersionManager + from .async_logger import AsyncLogger ++from .utils import get_error_context, create_box_message + # Set up logging + logging.basicConfig(level=logging.INFO) + logger = logging.getLogger(__name__) +@@ -97,35 +98,84 @@ class AsyncDatabaseManager: + + @asynccontextmanager + async def get_connection(self): +- """Connection pool manager""" ++ """Connection pool manager with enhanced error handling""" + if not self._initialized: +- # Use an asyncio.Lock to ensure only one initialization occurs + async with self.init_lock: + if not self._initialized: +- await self.initialize() +- self._initialized = True ++ try: ++ await self.initialize() ++ self._initialized = True ++ except 
Exception as e: ++ import sys ++ error_context = get_error_context(sys.exc_info()) ++ self.logger.error( ++ message="Database initialization failed:\n{error}\n\nContext:\n{context}\n\nTraceback:\n{traceback}", ++ tag="ERROR", ++ force_verbose=True, ++ params={ ++ "error": str(e), ++ "context": error_context["code_context"], ++ "traceback": error_context["full_traceback"] ++ } ++ ) ++ raise + + await self.connection_semaphore.acquire() + task_id = id(asyncio.current_task()) ++ + try: + async with self.pool_lock: + if task_id not in self.connection_pool: +- conn = await aiosqlite.connect( +- self.db_path, +- timeout=30.0 +- ) +- await conn.execute('PRAGMA journal_mode = WAL') +- await conn.execute('PRAGMA busy_timeout = 5000') +- self.connection_pool[task_id] = conn ++ try: ++ conn = await aiosqlite.connect( ++ self.db_path, ++ timeout=30.0 ++ ) ++ await conn.execute('PRAGMA journal_mode = WAL') ++ await conn.execute('PRAGMA busy_timeout = 5000') ++ ++ # Verify database structure ++ async with conn.execute("PRAGMA table_info(crawled_data)") as cursor: ++ columns = await cursor.fetchall() ++ column_names = [col[1] for col in columns] ++ expected_columns = { ++ 'url', 'html', 'cleaned_html', 'markdown', 'extracted_content', ++ 'success', 'media', 'links', 'metadata', 'screenshot', ++ 'response_headers', 'downloaded_files' ++ } ++ missing_columns = expected_columns - set(column_names) ++ if missing_columns: ++ raise ValueError(f"Database missing columns: {missing_columns}") ++ ++ self.connection_pool[task_id] = conn ++ except Exception as e: ++ import sys ++ error_context = get_error_context(sys.exc_info()) ++ error_message = ( ++ f"Unexpected error in db get_connection at line {error_context['line_no']} " ++ f"in {error_context['function']} ({error_context['filename']}):\n" ++ f"Error: {str(e)}\n\n" ++ f"Code context:\n{error_context['code_context']}" ++ ) ++ self.logger.error( ++ message=create_box_message(error_message, type= "error"), ++ ) ++ ++ raise + + yield 
self.connection_pool[task_id] + + except Exception as e: ++ import sys ++ error_context = get_error_context(sys.exc_info()) ++ error_message = ( ++ f"Unexpected error in db get_connection at line {error_context['line_no']} " ++ f"in {error_context['function']} ({error_context['filename']}):\n" ++ f"Error: {str(e)}\n\n" ++ f"Code context:\n{error_context['code_context']}" ++ ) + self.logger.error( +- message="Connection error: {error}", +- tag="ERROR", +- force_verbose=True, +- params={"error": str(e)} ++ message=create_box_message(error_message, type= "error"), + ) + raise + finally: +@@ -230,7 +280,8 @@ class AsyncDatabaseManager: + 'cleaned_html': row_dict['cleaned_html'], + 'markdown': row_dict['markdown'], + 'extracted_content': row_dict['extracted_content'], +- 'screenshot': row_dict['screenshot'] ++ 'screenshot': row_dict['screenshot'], ++ 'screenshots': row_dict['screenshot'], + } + + for field, hash_value in content_fields.items(): +diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py +index fc6fe82..72ef0bf 100644 +--- a/crawl4ai/async_webcrawler.py ++++ b/crawl4ai/async_webcrawler.py +@@ -1,4 +1,4 @@ +-import os ++import os, sys + import time + import warnings + from enum import Enum +@@ -17,7 +17,7 @@ from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawler + from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode + from .content_scraping_strategy import WebScrapingStrategy + from .async_logger import AsyncLogger +- ++from .async_configs import BrowserConfig, CrawlerRunConfig + from .config import ( + MIN_WORD_THRESHOLD, + IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD, +@@ -40,31 +40,20 @@ class AsyncWebCrawler: + """ + Asynchronous web crawler with flexible caching capabilities. 
+ +- Migration Guide (from version X.X.X): ++ Migration Guide: + Old way (deprecated): +- crawler = AsyncWebCrawler(always_by_pass_cache=True) +- result = await crawler.arun( +- url="https://example.com", +- bypass_cache=True, +- no_cache_read=True, +- no_cache_write=False +- ) ++ crawler = AsyncWebCrawler(always_by_pass_cache=True, browser_type="chromium", headless=True) + + New way (recommended): +- crawler = AsyncWebCrawler(always_bypass_cache=True) +- result = await crawler.arun( +- url="https://example.com", +- cache_mode=CacheMode.WRITE_ONLY +- ) +- +- To disable deprecation warnings: +- Pass warning=False to suppress the warning. ++ browser_config = BrowserConfig(browser_type="chromium", headless=True) ++ crawler = AsyncWebCrawler(browser_config=browser_config) + """ + _domain_last_hit = {} + + def __init__( + self, + crawler_strategy: Optional[AsyncCrawlerStrategy] = None, ++ config: Optional[BrowserConfig] = None, + always_bypass_cache: bool = False, + always_by_pass_cache: Optional[bool] = None, # Deprecated parameter + base_directory: str = str(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())), +@@ -75,28 +64,48 @@ class AsyncWebCrawler: + Initialize the AsyncWebCrawler. + + Args: +- crawler_strategy: Strategy for crawling web pages ++ crawler_strategy: Strategy for crawling web pages. If None, will create AsyncPlaywrightCrawlerStrategy ++ config: Configuration object for browser settings. 
If None, will be created from kwargs + always_bypass_cache: Whether to always bypass cache (new parameter) + always_by_pass_cache: Deprecated, use always_bypass_cache instead + base_directory: Base directory for storing cache ++ thread_safe: Whether to use thread-safe operations ++ **kwargs: Additional arguments for backwards compatibility + """ +- self.verbose = kwargs.get("verbose", False) ++ # Handle browser configuration ++ browser_config = config ++ if browser_config is not None: ++ if any(k in kwargs for k in ["browser_type", "headless", "viewport_width", "viewport_height"]): ++ self.logger.warning( ++ message="Both browser_config and legacy browser parameters provided. browser_config will take precedence.", ++ tag="WARNING" ++ ) ++ else: ++ # Create browser config from kwargs for backwards compatibility ++ browser_config = BrowserConfig.from_kwargs(kwargs) ++ ++ self.browser_config = browser_config ++ ++ # Initialize logger first since other components may need it + self.logger = AsyncLogger( + log_file=os.path.join(base_directory, ".crawl4ai", "crawler.log"), +- verbose=self.verbose, ++ verbose=self.browser_config.verbose, + tag_width=10 + ) ++ + ++ # Initialize crawler strategy + self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy( +- logger = self.logger, +- **kwargs ++ browser_config=browser_config, ++ logger=self.logger, ++ **kwargs # Pass remaining kwargs for backwards compatibility + ) + +- # Handle deprecated parameter ++ # Handle deprecated cache parameter + if always_by_pass_cache is not None: + if kwargs.get("warning", True): + warnings.warn( +- "'always_by_pass_cache' is deprecated and will be removed in version X.X.X. " ++ "'always_by_pass_cache' is deprecated and will be removed in version 0.5.0. " + "Use 'always_bypass_cache' instead. 
" + "Pass warning=False to suppress this warning.", + DeprecationWarning, +@@ -106,13 +115,15 @@ class AsyncWebCrawler: + else: + self.always_bypass_cache = always_bypass_cache + ++ # Thread safety setup + self._lock = asyncio.Lock() if thread_safe else None + ++ # Initialize directories + self.crawl4ai_folder = os.path.join(base_directory, ".crawl4ai") + os.makedirs(self.crawl4ai_folder, exist_ok=True) + os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True) ++ + self.ready = False +- self.verbose = kwargs.get("verbose", False) + + async def __aenter__(self): + await self.crawler_strategy.__aenter__() +@@ -131,197 +142,198 @@ class AsyncWebCrawler: + self.logger.info(f"Crawl4AI {crawl4ai_version}", tag="INIT") + self.ready = True + +- async def arun( +- self, +- url: str, +- word_count_threshold=MIN_WORD_THRESHOLD, +- extraction_strategy: ExtractionStrategy = None, +- chunking_strategy: ChunkingStrategy = RegexChunking(), +- content_filter: RelevantContentFilter = None, +- cache_mode: Optional[CacheMode] = None, +- # Deprecated parameters +- bypass_cache: bool = False, +- disable_cache: bool = False, +- no_cache_read: bool = False, +- no_cache_write: bool = False, +- # Other parameters +- css_selector: str = None, +- screenshot: bool = False, +- pdf: bool = False, +- user_agent: str = None, +- verbose=True, +- **kwargs, +- ) -> CrawlResult: +- """ +- Runs the crawler for a single source: URL (web, local file, or raw HTML). 
+ +- Migration from legacy cache parameters: ++ async def arun( ++ self, ++ url: str, ++ config: Optional[CrawlerRunConfig] = None, ++ # Legacy parameters maintained for backwards compatibility ++ word_count_threshold=MIN_WORD_THRESHOLD, ++ extraction_strategy: ExtractionStrategy = None, ++ chunking_strategy: ChunkingStrategy = RegexChunking(), ++ content_filter: RelevantContentFilter = None, ++ cache_mode: Optional[CacheMode] = None, ++ # Deprecated cache parameters ++ bypass_cache: bool = False, ++ disable_cache: bool = False, ++ no_cache_read: bool = False, ++ no_cache_write: bool = False, ++ # Other legacy parameters ++ css_selector: str = None, ++ screenshot: bool = False, ++ pdf: bool = False, ++ user_agent: str = None, ++ verbose=True, ++ **kwargs, ++ ) -> CrawlResult: ++ """ ++ Runs the crawler for a single source: URL (web, local file, or raw HTML). ++ ++ Migration Guide: + Old way (deprecated): +- await crawler.arun(url, bypass_cache=True, no_cache_read=True) ++ result = await crawler.arun( ++ url="https://example.com", ++ word_count_threshold=200, ++ screenshot=True, ++ ... ++ ) + +- New way: +- await crawler.arun(url, cache_mode=CacheMode.BYPASS) ++ New way (recommended): ++ config = CrawlerRunConfig( ++ word_count_threshold=200, ++ screenshot=True, ++ ... 
++ ) ++ result = await crawler.arun(url="https://example.com", crawler_config=config) + +- Args: +- url: The URL to crawl (http://, https://, file://, or raw:) +- cache_mode: Cache behavior control (recommended) +- word_count_threshold: Minimum word count threshold +- extraction_strategy: Strategy for content extraction +- chunking_strategy: Strategy for content chunking +- css_selector: CSS selector for content extraction +- screenshot: Whether to capture screenshot +- user_agent: Custom user agent +- verbose: Enable verbose logging ++ Args: ++ url: The URL to crawl (http://, https://, file://, or raw:) ++ crawler_config: Configuration object controlling crawl behavior ++ [other parameters maintained for backwards compatibility] + +- Deprecated Args: +- bypass_cache: Use cache_mode=CacheMode.BYPASS instead +- disable_cache: Use cache_mode=CacheMode.DISABLED instead +- no_cache_read: Use cache_mode=CacheMode.WRITE_ONLY instead +- no_cache_write: Use cache_mode=CacheMode.READ_ONLY instead +- +- Returns: +- CrawlResult: The result of crawling and processing +- """ +- # Check if url is not string and is not empty +- if not isinstance(url, str) or not url: +- raise ValueError("Invalid URL, make sure the URL is a non-empty string") +- +- async with self._lock or self.nullcontext(): # Lock for thread safety previously -> nullcontext(): +- try: +- # Handle deprecated parameters +- if any([bypass_cache, disable_cache, no_cache_read, no_cache_write]): +- if kwargs.get("warning", True): +- warnings.warn( +- "Cache control boolean flags are deprecated and will be removed in version X.X.X. " +- "Use 'cache_mode' parameter instead. 
Examples:\n" +- "- For bypass_cache=True, use cache_mode=CacheMode.BYPASS\n" +- "- For disable_cache=True, use cache_mode=CacheMode.DISABLED\n" +- "- For no_cache_read=True, use cache_mode=CacheMode.WRITE_ONLY\n" +- "- For no_cache_write=True, use cache_mode=CacheMode.READ_ONLY\n" +- "Pass warning=False to suppress this warning.", +- DeprecationWarning, +- stacklevel=2 +- ) ++ Returns: ++ CrawlResult: The result of crawling and processing ++ """ ++ crawler_config = config ++ if not isinstance(url, str) or not url: ++ raise ValueError("Invalid URL, make sure the URL is a non-empty string") ++ ++ async with self._lock or self.nullcontext(): ++ try: ++ # Handle configuration ++ if crawler_config is not None: ++ if any(param is not None for param in [ ++ word_count_threshold, extraction_strategy, chunking_strategy, ++ content_filter, cache_mode, css_selector, screenshot, pdf ++ ]): ++ self.logger.warning( ++ message="Both crawler_config and legacy parameters provided. crawler_config will take precedence.", ++ tag="WARNING" ++ ) ++ config = crawler_config ++ else: ++ # Merge all parameters into a single kwargs dict for config creation ++ config_kwargs = { ++ "word_count_threshold": word_count_threshold, ++ "extraction_strategy": extraction_strategy, ++ "chunking_strategy": chunking_strategy, ++ "content_filter": content_filter, ++ "cache_mode": cache_mode, ++ "bypass_cache": bypass_cache, ++ "disable_cache": disable_cache, ++ "no_cache_read": no_cache_read, ++ "no_cache_write": no_cache_write, ++ "css_selector": css_selector, ++ "screenshot": screenshot, ++ "pdf": pdf, ++ "verbose": verbose, ++ **kwargs ++ } ++ config = CrawlerRunConfig.from_kwargs(config_kwargs) ++ ++ # Handle deprecated cache parameters ++ if any([bypass_cache, disable_cache, no_cache_read, no_cache_write]): ++ if kwargs.get("warning", True): ++ warnings.warn( ++ "Cache control boolean flags are deprecated and will be removed in version 0.5.0. 
" ++ "Use 'cache_mode' parameter instead.", ++ DeprecationWarning, ++ stacklevel=2 ++ ) ++ ++ # Convert legacy parameters if cache_mode not provided ++ if config.cache_mode is None: ++ config.cache_mode = _legacy_to_cache_mode( ++ disable_cache=disable_cache, ++ bypass_cache=bypass_cache, ++ no_cache_read=no_cache_read, ++ no_cache_write=no_cache_write ++ ) + +- # Convert legacy parameters if cache_mode not provided +- if cache_mode is None: +- cache_mode = _legacy_to_cache_mode( +- disable_cache=disable_cache, +- bypass_cache=bypass_cache, +- no_cache_read=no_cache_read, +- no_cache_write=no_cache_write +- ) +- +- # Default to ENABLED if no cache mode specified +- if cache_mode is None: +- cache_mode = CacheMode.ENABLED +- +- # Create cache context +- cache_context = CacheContext(url, cache_mode, self.always_bypass_cache) +- +- extraction_strategy = extraction_strategy or NoExtractionStrategy() +- extraction_strategy.verbose = verbose +- if not isinstance(extraction_strategy, ExtractionStrategy): +- raise ValueError("Unsupported extraction strategy") +- if not isinstance(chunking_strategy, ChunkingStrategy): +- raise ValueError("Unsupported chunking strategy") +- +- word_count_threshold = max(word_count_threshold, MIN_WORD_THRESHOLD) +- +- async_response: AsyncCrawlResponse = None +- cached_result = None +- screenshot_data = None +- pdf_data = None +- extracted_content = None +- +- start_time = time.perf_counter() +- +- # Try to get cached result if appropriate +- if cache_context.should_read(): +- cached_result = await async_db_manager.aget_cached_url(url) +- +- if cached_result: +- html = sanitize_input_encode(cached_result.html) +- extracted_content = sanitize_input_encode(cached_result.extracted_content or "") +- if screenshot: ++ # Default to ENABLED if no cache mode specified ++ if config.cache_mode is None: ++ config.cache_mode = CacheMode.ENABLED ++ ++ # Create cache context ++ cache_context = CacheContext(url, config.cache_mode, self.always_bypass_cache) 
++ ++ # Initialize processing variables ++ async_response: AsyncCrawlResponse = None ++ cached_result = None ++ screenshot_data = None ++ pdf_data = None ++ extracted_content = None ++ start_time = time.perf_counter() ++ ++ # Try to get cached result if appropriate ++ if cache_context.should_read(): ++ cached_result = await async_db_manager.aget_cached_url(url) ++ ++ if cached_result: ++ html = sanitize_input_encode(cached_result.html) ++ extracted_content = sanitize_input_encode(cached_result.extracted_content or "") ++ # If screenshot is requested but its not in cache, then set cache_result to None + screenshot_data = cached_result.screenshot +- if not screenshot_data: +- cached_result = None +- if pdf: + pdf_data = cached_result.pdf +- if not pdf_data: ++ if config.screenshot and not screenshot or config.pdf and not pdf: + cached_result = None +- # if verbose: +- # print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Cache hit for {cache_context.display_url} | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {time.perf_counter() - start_time:.2f}s") +- self.logger.url_status( ++ ++ self.logger.url_status( + url=cache_context.display_url, + success=bool(html), + timing=time.perf_counter() - start_time, + tag="FETCH" +- ) ++ ) + ++ # Fetch fresh content if needed ++ if not cached_result or not html: ++ t1 = time.perf_counter() ++ ++ if user_agent: ++ self.crawler_strategy.update_user_agent(user_agent) ++ ++ # Pass config to crawl method ++ async_response = await self.crawler_strategy.crawl( ++ url, ++ config=config # Pass the entire config object ++ ) ++ ++ html = sanitize_input_encode(async_response.html) ++ screenshot_data = async_response.screenshot ++ pdf_data = async_response.pdf_data ++ ++ t2 = time.perf_counter() ++ self.logger.url_status( ++ url=cache_context.display_url, ++ success=bool(html), ++ timing=t2 - t1, ++ tag="FETCH" ++ ) + +- # Fetch fresh content if needed +- if not cached_result or not 
html: +- t1 = time.perf_counter() +- +- if user_agent: +- self.crawler_strategy.update_user_agent(user_agent) +- async_response: AsyncCrawlResponse = await self.crawler_strategy.crawl( +- url, +- screenshot=screenshot, +- pdf=pdf, +- **kwargs +- ) +- html = sanitize_input_encode(async_response.html) +- screenshot_data = async_response.screenshot +- pdf_data = async_response.pdf_data +- t2 = time.perf_counter() +- self.logger.url_status( +- url=cache_context.display_url, +- success=bool(html), +- timing=t2 - t1, +- tag="FETCH" ++ # Process the HTML content ++ crawl_result = await self.aprocess_html( ++ url=url, ++ html=html, ++ extracted_content=extracted_content, ++ config=config, # Pass the config object instead of individual parameters ++ screenshot=screenshot_data, ++ pdf_data=pdf_data, ++ verbose=config.verbose + ) +- # if verbose: +- # print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Live fetch for {cache_context.display_url}... | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {t2 - t1:.2f}s") +- +- # Process the HTML content +- crawl_result = await self.aprocess_html( +- url=url, +- html=html, +- extracted_content=extracted_content, +- word_count_threshold=word_count_threshold, +- extraction_strategy=extraction_strategy, +- chunking_strategy=chunking_strategy, +- content_filter=content_filter, +- css_selector=css_selector, +- screenshot=screenshot_data, +- pdf_data=pdf_data, +- verbose=verbose, +- is_cached=bool(cached_result), +- async_response=async_response, +- is_web_url=cache_context.is_web_url, +- is_local_file=cache_context.is_local_file, +- is_raw_html=cache_context.is_raw_html, +- **kwargs, +- ) +- +- # Set response data +- if async_response: +- crawl_result.status_code = async_response.status_code +- crawl_result.response_headers = async_response.response_headers +- crawl_result.downloaded_files = async_response.downloaded_files +- else: +- crawl_result.status_code = 200 +- 
crawl_result.response_headers = cached_result.response_headers if cached_result else {} + +- crawl_result.success = bool(html) +- crawl_result.session_id = kwargs.get("session_id", None) ++ # Set response data ++ if async_response: ++ crawl_result.status_code = async_response.status_code ++ crawl_result.response_headers = async_response.response_headers ++ crawl_result.downloaded_files = async_response.downloaded_files ++ else: ++ crawl_result.status_code = 200 ++ crawl_result.response_headers = cached_result.response_headers if cached_result else {} ++ ++ crawl_result.success = bool(html) ++ crawl_result.session_id = getattr(config, 'session_id', None) + +- # if verbose: +- # print(f"{Fore.GREEN}{self.tag_format('COMPLETE')} {self.log_icons['COMPLETE']} {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... | Status: {Fore.GREEN if crawl_result.success else Fore.RED}{crawl_result.success} | {Fore.YELLOW}Total: {time.perf_counter() - start_time:.2f}s{Style.RESET_ALL}") +- self.logger.success( ++ self.logger.success( + message="{url:.50}... 
| Status: {status} | Total: {timing}", + tag="COMPLETE", + params={ +@@ -335,254 +347,312 @@ class AsyncWebCrawler: + } + ) + +- # Update cache if appropriate +- if cache_context.should_write() and not bool(cached_result): +- await async_db_manager.acache_url(crawl_result) ++ # Update cache if appropriate ++ if cache_context.should_write() and not bool(cached_result): ++ await async_db_manager.acache_url(crawl_result) ++ ++ return crawl_result + +- return crawl_result ++ except Exception as e: ++ error_context = get_error_context(sys.exc_info()) ++ ++ error_message = ( ++ f"Unexpected error in _crawl_web at line {error_context['line_no']} " ++ f"in {error_context['function']} ({error_context['filename']}):\n" ++ f"Error: {str(e)}\n\n" ++ f"Code context:\n{error_context['code_context']}" ++ ) ++ # if not hasattr(e, "msg"): ++ # e.msg = str(e) ++ ++ self.logger.error_status( ++ url=url, ++ error=create_box_message(error_message, type="error"), ++ tag="ERROR" ++ ) ++ ++ return CrawlResult( ++ url=url, ++ html="", ++ success=False, ++ error_message=error_message ++ ) ++ ++ async def aprocess_html( ++ self, ++ url: str, ++ html: str, ++ extracted_content: str, ++ config: CrawlerRunConfig, ++ screenshot: str, ++ pdf_data: str, ++ verbose: bool, ++ **kwargs, ++ ) -> CrawlResult: ++ """ ++ Process HTML content using the provided configuration. 
++ ++ Args: ++ url: The URL being processed ++ html: Raw HTML content ++ extracted_content: Previously extracted content (if any) ++ config: Configuration object controlling processing behavior ++ screenshot: Screenshot data (if any) ++ verbose: Whether to enable verbose logging ++ **kwargs: Additional parameters for backwards compatibility + ++ Returns: ++ CrawlResult: Processed result containing extracted and formatted content ++ """ ++ try: ++ _url = url if not kwargs.get("is_raw_html", False) else "Raw HTML" ++ t1 = time.perf_counter() ++ ++ # Initialize scraping strategy ++ scrapping_strategy = WebScrapingStrategy(logger=self.logger) ++ ++ # Process HTML content ++ result = scrapping_strategy.scrap( ++ url, ++ html, ++ word_count_threshold=config.word_count_threshold, ++ css_selector=config.css_selector, ++ only_text=config.only_text, ++ image_description_min_word_threshold=config.image_description_min_word_threshold, ++ content_filter=config.content_filter ++ ) ++ ++ if result is None: ++ raise ValueError(f"Process HTML, Failed to extract content from the website: {url}") ++ ++ except InvalidCSSSelectorError as e: ++ raise ValueError(str(e)) + except Exception as e: +- if not hasattr(e, "msg"): +- e.msg = str(e) +- # print(f"{Fore.RED}{self.tag_format('ERROR')} {self.log_icons['ERROR']} Failed to crawl {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... 
| {e.msg}{Style.RESET_ALL}") ++ raise ValueError(f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}") ++ ++ # Extract results ++ markdown_v2 = result.get("markdown_v2", None) ++ cleaned_html = sanitize_input_encode(result.get("cleaned_html", "")) ++ markdown = sanitize_input_encode(result.get("markdown", "")) ++ fit_markdown = sanitize_input_encode(result.get("fit_markdown", "")) ++ fit_html = sanitize_input_encode(result.get("fit_html", "")) ++ media = result.get("media", []) ++ links = result.get("links", []) ++ metadata = result.get("metadata", {}) ++ ++ # Log processing completion ++ self.logger.info( ++ message="Processed {url:.50}... | Time: {timing}ms", ++ tag="SCRAPE", ++ params={ ++ "url": _url, ++ "timing": int((time.perf_counter() - t1) * 1000) ++ } ++ ) ++ ++ # Handle content extraction if needed ++ if (extracted_content is None and ++ config.extraction_strategy and ++ config.chunking_strategy and ++ not isinstance(config.extraction_strategy, NoExtractionStrategy)): + +- self.logger.error_status( +- # url=cache_context.display_url, +- url=url, +- error=create_box_message(e.msg, type = "error"), +- tag="ERROR" +- ) +- return CrawlResult( +- url=url, +- html="", +- success=False, +- error_message=e.msg ++ t1 = time.perf_counter() ++ ++ # Handle different extraction strategy types ++ if isinstance(config.extraction_strategy, (JsonCssExtractionStrategy, JsonCssExtractionStrategy)): ++ config.extraction_strategy.verbose = verbose ++ extracted_content = config.extraction_strategy.run(url, [html]) ++ extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False) ++ else: ++ sections = config.chunking_strategy.chunk(markdown) ++ extracted_content = config.extraction_strategy.run(url, sections) ++ extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False) ++ ++ # Log extraction completion ++ self.logger.info( ++ message="Completed for {url:.50}... 
| Time: {timing}s", ++ tag="EXTRACT", ++ params={ ++ "url": _url, ++ "timing": time.perf_counter() - t1 ++ } + ) + +- async def arun_many( +- self, +- urls: List[str], +- word_count_threshold=MIN_WORD_THRESHOLD, +- extraction_strategy: ExtractionStrategy = None, +- chunking_strategy: ChunkingStrategy = RegexChunking(), +- content_filter: RelevantContentFilter = None, +- cache_mode: Optional[CacheMode] = None, +- # Deprecated parameters +- bypass_cache: bool = False, +- css_selector: str = None, +- screenshot: bool = False, +- pdf: bool = False, +- user_agent: str = None, +- verbose=True, +- **kwargs, +- ) -> List[CrawlResult]: +- """ +- Runs the crawler for multiple URLs concurrently. ++ # Handle screenshot and PDF data ++ screenshot_data = None if not screenshot else screenshot ++ pdf_data = None if not pdf_data else pdf_data ++ ++ # Apply HTML formatting if requested ++ if config.prettiify: ++ cleaned_html = fast_format_html(cleaned_html) ++ ++ # Return complete crawl result ++ return CrawlResult( ++ url=url, ++ html=html, ++ cleaned_html=cleaned_html, ++ markdown_v2=markdown_v2, ++ markdown=markdown, ++ fit_markdown=fit_markdown, ++ fit_html=fit_html, ++ media=media, ++ links=links, ++ metadata=metadata, ++ screenshot=screenshot_data, ++ pdf=pdf_data, ++ extracted_content=extracted_content, ++ success=True, ++ error_message="", ++ ) + +- Migration from legacy parameters: ++ async def arun_many( ++ self, ++ urls: List[str], ++ config: Optional[CrawlerRunConfig] = None, ++ # Legacy parameters maintained for backwards compatibility ++ word_count_threshold=MIN_WORD_THRESHOLD, ++ extraction_strategy: ExtractionStrategy = None, ++ chunking_strategy: ChunkingStrategy = RegexChunking(), ++ content_filter: RelevantContentFilter = None, ++ cache_mode: Optional[CacheMode] = None, ++ bypass_cache: bool = False, ++ css_selector: str = None, ++ screenshot: bool = False, ++ pdf: bool = False, ++ user_agent: str = None, ++ verbose=True, ++ **kwargs, ++ ) -> List[CrawlResult]: 
++ """ ++ Runs the crawler for multiple URLs concurrently. ++ ++ Migration Guide: + Old way (deprecated): +- results = await crawler.arun_many(urls, bypass_cache=True) ++ results = await crawler.arun_many( ++ urls, ++ word_count_threshold=200, ++ screenshot=True, ++ ... ++ ) + +- New way: +- results = await crawler.arun_many(urls, cache_mode=CacheMode.BYPASS) +- +- Args: +- urls: List of URLs to crawl +- cache_mode: Cache behavior control (recommended) +- [other parameters same as arun()] +- +- Returns: +- List[CrawlResult]: Results for each URL +- """ +- if bypass_cache: +- if kwargs.get("warning", True): +- warnings.warn( +- "'bypass_cache' is deprecated and will be removed in version X.X.X. " +- "Use 'cache_mode=CacheMode.BYPASS' instead. " +- "Pass warning=False to suppress this warning.", +- DeprecationWarning, +- stacklevel=2 ++ New way (recommended): ++ config = CrawlerRunConfig( ++ word_count_threshold=200, ++ screenshot=True, ++ ... + ) +- if cache_mode is None: +- cache_mode = CacheMode.BYPASS +- +- semaphore_count = kwargs.get('semaphore_count', 10) +- semaphore = asyncio.Semaphore(semaphore_count) ++ results = await crawler.arun_many(urls, crawler_config=config) + +- async def crawl_with_semaphore(url): +- domain = urlparse(url).netloc +- current_time = time.time() +- +- # print(f"{Fore.LIGHTBLACK_EX}{self.tag_format('PARALLEL')} Started task for {url[:50]}...{Style.RESET_ALL}") +- self.logger.debug( +- message="Started task for {url:.50}...", +- tag="PARALLEL", +- params={"url": url} +- ) +- +- # Get delay settings from kwargs or use defaults +- mean_delay = kwargs.get('mean_delay', 0.1) # 0.5 seconds default mean delay +- max_range = kwargs.get('max_range', 0.3) # 1 seconds default max additional delay +- +- # Check if we need to wait +- if domain in self._domain_last_hit: +- time_since_last = current_time - self._domain_last_hit[domain] +- if time_since_last < mean_delay: +- delay = mean_delay + random.uniform(0, max_range) +- await 
asyncio.sleep(delay) ++ Args: ++ urls: List of URLs to crawl ++ crawler_config: Configuration object controlling crawl behavior for all URLs ++ [other parameters maintained for backwards compatibility] + +- # Update last hit time +- self._domain_last_hit[domain] = current_time +- +- async with semaphore: +- return await self.arun( +- url, +- word_count_threshold=word_count_threshold, +- extraction_strategy=extraction_strategy, +- chunking_strategy=chunking_strategy, +- content_filter=content_filter, +- cache_mode=cache_mode, +- css_selector=css_selector, +- screenshot=screenshot, +- user_agent=user_agent, +- verbose=verbose, +- **kwargs, +- ) ++ Returns: ++ List[CrawlResult]: Results for each URL ++ """ ++ crawler_config = config ++ # Handle configuration ++ if crawler_config is not None: ++ if any(param is not None for param in [ ++ word_count_threshold, extraction_strategy, chunking_strategy, ++ content_filter, cache_mode, css_selector, screenshot, pdf ++ ]): ++ self.logger.warning( ++ message="Both crawler_config and legacy parameters provided. crawler_config will take precedence.", ++ tag="WARNING" ++ ) ++ config = crawler_config ++ else: ++ # Merge all parameters into a single kwargs dict for config creation ++ config_kwargs = { ++ "word_count_threshold": word_count_threshold, ++ "extraction_strategy": extraction_strategy, ++ "chunking_strategy": chunking_strategy, ++ "content_filter": content_filter, ++ "cache_mode": cache_mode, ++ "bypass_cache": bypass_cache, ++ "css_selector": css_selector, ++ "screenshot": screenshot, ++ "pdf": pdf, ++ "verbose": verbose, ++ **kwargs ++ } ++ config = CrawlerRunConfig.from_kwargs(config_kwargs) ++ ++ if bypass_cache: ++ if kwargs.get("warning", True): ++ warnings.warn( ++ "'bypass_cache' is deprecated and will be removed in version 0.5.0. " ++ "Use 'cache_mode=CacheMode.BYPASS' instead. 
" ++ "Pass warning=False to suppress this warning.", ++ DeprecationWarning, ++ stacklevel=2 ++ ) ++ if config.cache_mode is None: ++ config.cache_mode = CacheMode.BYPASS + +- # Print start message +- # print(f"{Fore.CYAN}{self.tag_format('INIT')} {self.log_icons['INIT']} Starting concurrent crawling for {len(urls)} URLs...{Style.RESET_ALL}") +- self.logger.info( +- message="Starting concurrent crawling for {count} URLs...", +- tag="INIT", +- params={"count": len(urls)} +- ) +- start_time = time.perf_counter() +- tasks = [crawl_with_semaphore(url) for url in urls] +- results = await asyncio.gather(*tasks, return_exceptions=True) +- end_time = time.perf_counter() +- # print(f"{Fore.YELLOW}{self.tag_format('COMPLETE')} {self.log_icons['COMPLETE']} Concurrent crawling completed for {len(urls)} URLs | Total time: {end_time - start_time:.2f}s{Style.RESET_ALL}") +- self.logger.success( +- message="Concurrent crawling completed for {count} URLs | " + Fore.YELLOW + " Total time: {timing}" + Style.RESET_ALL, +- tag="COMPLETE", +- params={ +- "count": len(urls), +- "timing": f"{end_time - start_time:.2f}s" +- }, +- colors={"timing": Fore.YELLOW} +- ) +- return [result if not isinstance(result, Exception) else str(result) for result in results] ++ semaphore_count = config.semaphore_count or 5 ++ semaphore = asyncio.Semaphore(semaphore_count) + ++ async def crawl_with_semaphore(url): ++ # Handle rate limiting per domain ++ domain = urlparse(url).netloc ++ current_time = time.time() ++ ++ self.logger.debug( ++ message="Started task for {url:.50}...", ++ tag="PARALLEL", ++ params={"url": url} ++ ) + +- async def aprocess_html( +- self, +- url: str, +- html: str, +- extracted_content: str, +- word_count_threshold: int, +- extraction_strategy: ExtractionStrategy, +- chunking_strategy: ChunkingStrategy, +- content_filter: RelevantContentFilter, +- css_selector: str, +- screenshot: str, +- verbose: bool, +- **kwargs, +- ) -> CrawlResult: +- # Extract content from HTML +- try: +- _url 
= url if not kwargs.get("is_raw_html", False) else "Raw HTML" +- t1 = time.perf_counter() +- scrapping_strategy = WebScrapingStrategy( +- logger=self.logger, +- ) +- # result = await scrapping_strategy.ascrap( +- result = scrapping_strategy.scrap( +- url, +- html, +- word_count_threshold=word_count_threshold, +- css_selector=css_selector, +- only_text=kwargs.pop("only_text", False), +- image_description_min_word_threshold=kwargs.pop( +- "image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD +- ), +- content_filter = content_filter, +- **kwargs, +- ) ++ # Get delay settings from config ++ mean_delay = config.mean_delay ++ max_range = config.max_range ++ ++ # Apply rate limiting ++ if domain in self._domain_last_hit: ++ time_since_last = current_time - self._domain_last_hit[domain] ++ if time_since_last < mean_delay: ++ delay = mean_delay + random.uniform(0, max_range) ++ await asyncio.sleep(delay) ++ ++ self._domain_last_hit[domain] = current_time + +- if result is None: +- raise ValueError(f"Process HTML, Failed to extract content from the website: {url}") +- except InvalidCSSSelectorError as e: +- raise ValueError(str(e)) +- except Exception as e: +- raise ValueError(f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}") ++ async with semaphore: ++ return await self.arun( ++ url, ++ crawler_config=config, # Pass the entire config object ++ user_agent=user_agent # Maintain user_agent override capability ++ ) + +- markdown_v2: MarkdownGenerationResult = result.get("markdown_v2", None) +- +- cleaned_html = sanitize_input_encode(result.get("cleaned_html", "")) +- markdown = sanitize_input_encode(result.get("markdown", "")) +- fit_markdown = sanitize_input_encode(result.get("fit_markdown", "")) +- fit_html = sanitize_input_encode(result.get("fit_html", "")) +- media = result.get("media", []) +- links = result.get("links", []) +- metadata = result.get("metadata", {}) +- +- # if verbose: +- # 
print(f"{Fore.MAGENTA}{self.tag_format('SCRAPE')} {self.log_icons['SCRAPE']} Processed {_url[:URL_LOG_SHORTEN_LENGTH]}...{Style.RESET_ALL} | Time: {int((time.perf_counter() - t1) * 1000)}ms") +- self.logger.info( +- message="Processed {url:.50}... | Time: {timing}ms", +- tag="SCRAPE", +- params={ +- "url": _url, +- "timing": int((time.perf_counter() - t1) * 1000) +- } +- ) ++ # Log start of concurrent crawling ++ self.logger.info( ++ message="Starting concurrent crawling for {count} URLs...", ++ tag="INIT", ++ params={"count": len(urls)} ++ ) + ++ # Execute concurrent crawls ++ start_time = time.perf_counter() ++ tasks = [crawl_with_semaphore(url) for url in urls] ++ results = await asyncio.gather(*tasks, return_exceptions=True) ++ end_time = time.perf_counter() + +- if extracted_content is None and extraction_strategy and chunking_strategy and not isinstance(extraction_strategy, NoExtractionStrategy): +- t1 = time.perf_counter() +- # Check if extraction strategy is type of JsonCssExtractionStrategy +- if isinstance(extraction_strategy, JsonCssExtractionStrategy) or isinstance(extraction_strategy, JsonCssExtractionStrategy): +- extraction_strategy.verbose = verbose +- extracted_content = extraction_strategy.run(url, [html]) +- extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False) +- else: +- sections = chunking_strategy.chunk(markdown) +- extracted_content = extraction_strategy.run(url, sections) +- extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False) +- # if verbose: +- # print(f"{Fore.YELLOW}{self.tag_format('EXTRACT')} {self.log_icons['EXTRACT']} Completed for {_url[:URL_LOG_SHORTEN_LENGTH]}...{Style.RESET_ALL} | Time: {time.perf_counter() - t1:.2f}s{Style.RESET_ALL}") +- self.logger.info( +- message="Completed for {url:.50}... 
| Time: {timing}s", +- tag="EXTRACT", ++ # Log completion ++ self.logger.success( ++ message="Concurrent crawling completed for {count} URLs | Total time: {timing}", ++ tag="COMPLETE", + params={ +- "url": _url, +- "timing": time.perf_counter() - t1 ++ "count": len(urls), ++ "timing": f"{end_time - start_time:.2f}s" ++ }, ++ colors={ ++ "timing": Fore.YELLOW + } + ) + +- screenshot = None if not screenshot else screenshot +- pdf_data = kwargs.get("pdf_data", None) +- +- +- if kwargs.get("prettiify", False): +- cleaned_html = fast_format_html(cleaned_html) +- +- return CrawlResult( +- url=url, +- html=html, +- cleaned_html=cleaned_html, +- markdown_v2=markdown_v2, +- markdown=markdown, +- fit_markdown=fit_markdown, +- fit_html= fit_html, +- media=media, +- links=links, +- metadata=metadata, +- screenshot=screenshot, +- pdf=pdf_data, +- extracted_content=extracted_content, +- success=True, +- error_message="", +- ) ++ return [result if not isinstance(result, Exception) else str(result) for result in results] + + async def aclear_cache(self): + """Clear the cache database.""" +diff --git a/crawl4ai/config.py b/crawl4ai/config.py +index e17ff34..7c8a931 100644 +--- a/crawl4ai/config.py ++++ b/crawl4ai/config.py +@@ -57,4 +57,6 @@ MAX_METRICS_HISTORY = 1000 + NEED_MIGRATION = True + URL_LOG_SHORTEN_LENGTH = 30 + SHOW_DEPRECATION_WARNINGS = True +-SCREENSHOT_HEIGHT_TRESHOLD = 10000 +\ No newline at end of file ++SCREENSHOT_HEIGHT_TRESHOLD = 10000 ++PAGE_TIMEOUT=60000 ++DOWNLOAD_PAGE_TIMEOUT=60000 +\ No newline at end of file +diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py +index 8a12ff0..7ecc22d 100644 +--- a/crawl4ai/utils.py ++++ b/crawl4ai/utils.py +@@ -29,7 +29,7 @@ class InvalidCSSSelectorError(Exception): + def create_box_message( + message: str, + type: str = "info", +- width: int = 80, ++ width: int = 120, + add_newlines: bool = True, + double_line: bool = False + ) -> str: +@@ -1223,7 +1223,8 @@ def ensure_content_dirs(base_path: str) -> Dict[str, str]: + 
'cleaned': 'cleaned_html', + 'markdown': 'markdown_content', + 'extracted': 'extracted_content', +- 'screenshots': 'screenshots' ++ 'screenshots': 'screenshots', ++ 'screenshot': 'screenshots' + } + + content_paths = {} +@@ -1232,4 +1233,60 @@ def ensure_content_dirs(base_path: str) -> Dict[str, str]: + os.makedirs(path, exist_ok=True) + content_paths[key] = path + +- return content_paths +\ No newline at end of file ++ return content_paths ++ ++def get_error_context(exc_info, context_lines: int = 5): ++ """ ++ Extract error context with more reliable line number tracking. ++ ++ Args: ++ exc_info: The exception info from sys.exc_info() ++ context_lines: Number of lines to show before and after the error ++ ++ Returns: ++ dict: Error context information ++ """ ++ import traceback ++ import linecache ++ import os ++ ++ # Get the full traceback ++ tb = traceback.extract_tb(exc_info[2]) ++ ++ # Get the last frame (where the error occurred) ++ last_frame = tb[-1] ++ filename = last_frame.filename ++ line_no = last_frame.lineno ++ func_name = last_frame.name ++ ++ # Get the source code context using linecache ++ # This is more reliable than inspect.getsourcelines ++ context_start = max(1, line_no - context_lines) ++ context_end = line_no + context_lines + 1 ++ ++ # Build the context lines with line numbers ++ context_lines = [] ++ for i in range(context_start, context_end): ++ line = linecache.getline(filename, i) ++ if line: ++ # Remove any trailing whitespace/newlines and add the pointer for error line ++ line = line.rstrip() ++ pointer = '→' if i == line_no else ' ' ++ context_lines.append(f"{i:4d} {pointer} {line}") ++ ++ # Join the lines with newlines ++ code_context = '\n'.join(context_lines) ++ ++ # Get relative path for cleaner output ++ try: ++ rel_path = os.path.relpath(filename) ++ except ValueError: ++ # Fallback if relpath fails (can happen on Windows with different drives) ++ rel_path = filename ++ ++ return { ++ "filename": rel_path, ++ "line_no": line_no, 
++ "function": func_name, ++ "code_context": code_context ++ } +\ No newline at end of file +diff --git a/docs/md_v2/basic/cache-modes.md b/docs/md_v2/basic/cache-modes.md +index 04a4f21..01cfe34 100644 +--- a/docs/md_v2/basic/cache-modes.md ++++ b/docs/md_v2/basic/cache-modes.md +@@ -1,7 +1,7 @@ + # Crawl4AI Cache System and Migration Guide + + ## Overview +-Starting from version X.X.X, Crawl4AI introduces a new caching system that replaces the old boolean flags with a more intuitive `CacheMode` enum. This change simplifies cache control and makes the behavior more predictable. ++Starting from version 0.5.0, Crawl4AI introduces a new caching system that replaces the old boolean flags with a more intuitive `CacheMode` enum. This change simplifies cache control and makes the behavior more predictable. + + ## Old vs New Approach + diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py index cee7c25b..d297dfca 100644 --- a/crawl4ai/__init__.py +++ b/crawl4ai/__init__.py @@ -1,7 +1,11 @@ # __init__.py from .async_webcrawler import AsyncWebCrawler, CacheMode - +from .async_configs import BrowserConfig, CrawlerRunConfig +from .extraction_strategy import ExtractionStrategy, LLMExtractionStrategy, CosineStrategy, JsonCssExtractionStrategy +from .chunking_strategy import ChunkingStrategy, RegexChunking +from .markdown_generation_strategy import DefaultMarkdownGenerator +from .content_filter_strategy import PruningContentFilter, BM25ContentFilter from .models import CrawlResult from .__version__ import __version__ @@ -9,6 +13,17 @@ __all__ = [ "AsyncWebCrawler", "CrawlResult", "CacheMode", + 'BrowserConfig', + 'CrawlerRunConfig', + 'ExtractionStrategy', + 'LLMExtractionStrategy', + 'CosineStrategy', + 'JsonCssExtractionStrategy', + 'ChunkingStrategy', + 'RegexChunking', + 'DefaultMarkdownGenerator', + 'PruningContentFilter', + 'BM25ContentFilter', ] def is_sync_version_installed(): diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py new file mode 100644 
index 00000000..41574fe6 --- /dev/null +++ b/crawl4ai/async_configs.py @@ -0,0 +1,402 @@ +from .config import ( + MIN_WORD_THRESHOLD, + IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD, + SCREENSHOT_HEIGHT_TRESHOLD, + PAGE_TIMEOUT +) +from .user_agent_generator import UserAgentGenerator +from .extraction_strategy import ExtractionStrategy +from .chunking_strategy import ChunkingStrategy + +class BrowserConfig: + """ + Configuration class for setting up a browser instance and its context in AsyncPlaywrightCrawlerStrategy. + + This class centralizes all parameters that affect browser and context creation. Instead of passing + scattered keyword arguments, users can instantiate and modify this configuration object. The crawler + code will then reference these settings to initialize the browser in a consistent, documented manner. + + Attributes: + browser_type (str): The type of browser to launch. Supported values: "chromium", "firefox", "webkit". + Default: "chromium". + headless (bool): Whether to run the browser in headless mode (no visible GUI). + Default: True. + use_managed_browser (bool): Launch the browser using a managed approach (e.g., via CDP), allowing + advanced manipulation. Default: False. + use_persistent_context (bool): Use a persistent browser context (like a persistent profile). + Automatically sets use_managed_browser=True. Default: False. + user_data_dir (str or None): Path to a user data directory for persistent sessions. If None, a + temporary directory may be used. Default: None. + chrome_channel (str): The Chrome channel to launch (e.g., "chrome", "msedge"). Only applies if browser_type + is "chromium". Default: "chrome". + proxy (str or None): Proxy server URL (e.g., "http://username:password@proxy:port"). If None, no proxy is used. + Default: None. + proxy_config (dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}. + If None, no additional proxy config. Default: None. 
+ viewport_width (int): Default viewport width for pages. Default: 1920. + viewport_height (int): Default viewport height for pages. Default: 1080. + verbose (bool): Enable verbose logging. + Default: True. + accept_downloads (bool): Whether to allow file downloads. If True, requires a downloads_path. + Default: False. + downloads_path (str or None): Directory to store downloaded files. If None and accept_downloads is True, + a default path will be created. Default: None. + storage_state (str or dict or None): Path or object describing storage state (cookies, localStorage). + Default: None. + ignore_https_errors (bool): Ignore HTTPS certificate errors. Default: True. + java_script_enabled (bool): Enable JavaScript execution in pages. Default: True. + cookies (list): List of cookies to add to the browser context. Each cookie is a dict with fields like + {"name": "...", "value": "...", "url": "..."}. + Default: []. + headers (dict): Extra HTTP headers to apply to all requests in this context. + Default: {}. + user_agent (str): Custom User-Agent string to use. Default: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " + "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36". + user_agent_mode (str or None): Mode for generating the user agent (e.g., "random"). If None, use the provided + user_agent as-is. Default: None. + user_agent_generator_config (dict or None): Configuration for user agent generation if user_agent_mode is set. + Default: None. + text_only (bool): If True, disables images and other rich content for potentially faster load times. + Default: False. + light_mode (bool): Disables certain background features for performance gains. Default: False. + extra_args (list): Additional command-line arguments passed to the browser. + Default: []. 
+ """ + + def __init__( + self, + browser_type: str = "chromium", + headless: bool = True, + use_managed_browser: bool = False, + use_persistent_context: bool = False, + user_data_dir: str = None, + chrome_channel: str = "chrome", + proxy: str = None, + proxy_config: dict = None, + viewport_width: int = 1920, + viewport_height: int = 1080, + accept_downloads: bool = False, + downloads_path: str = None, + storage_state=None, + ignore_https_errors: bool = True, + java_script_enabled: bool = True, + sleep_on_close: bool = False, + verbose: bool = True, + cookies: list = None, + headers: dict = None, + user_agent: str = ( + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/116.0.5845.187 Safari/604.1 Edg/117.0.2045.47" + ), + user_agent_mode: str = None, + user_agent_generator_config: dict = None, + text_only: bool = False, + light_mode: bool = False, + extra_args: list = None, + ): + self.browser_type = browser_type + self.headless = headless + self.use_managed_browser = use_managed_browser + self.use_persistent_context = use_persistent_context + self.user_data_dir = user_data_dir + if self.browser_type == "chromium": + self.chrome_channel = "chrome" + elif self.browser_type == "firefox": + self.chrome_channel = "firefox" + elif self.browser_type == "webkit": + self.chrome_channel = "webkit" + else: + self.chrome_channel = chrome_channel or "chrome" + self.proxy = proxy + self.proxy_config = proxy_config + self.viewport_width = viewport_width + self.viewport_height = viewport_height + self.accept_downloads = accept_downloads + self.downloads_path = downloads_path + self.storage_state = storage_state + self.ignore_https_errors = ignore_https_errors + self.java_script_enabled = java_script_enabled + self.cookies = cookies if cookies is not None else [] + self.headers = headers if headers is not None else {} + self.user_agent = user_agent + self.user_agent_mode = user_agent_mode + self.user_agent_generator_config 
= user_agent_generator_config + self.text_only = text_only + self.light_mode = light_mode + self.extra_args = extra_args if extra_args is not None else [] + self.sleep_on_close = sleep_on_close + self.verbose = verbose + + user_agenr_generator = UserAgentGenerator() + if self.user_agent_mode != "random": + self.user_agent = user_agenr_generator.generate( + **(self.user_agent_generator_config or {}) + ) + self.browser_hint = user_agenr_generator.generate_client_hints(self.user_agent) + self.headers.setdefault("sec-ch-ua", self.browser_hint) + + # If persistent context is requested, ensure managed browser is enabled + if self.use_persistent_context: + self.use_managed_browser = True + + @staticmethod + def from_kwargs(kwargs: dict) -> "BrowserConfig": + return BrowserConfig( + browser_type=kwargs.get("browser_type", "chromium"), + headless=kwargs.get("headless", True), + use_managed_browser=kwargs.get("use_managed_browser", False), + use_persistent_context=kwargs.get("use_persistent_context", False), + user_data_dir=kwargs.get("user_data_dir"), + chrome_channel=kwargs.get("chrome_channel", "chrome"), + proxy=kwargs.get("proxy"), + proxy_config=kwargs.get("proxy_config"), + viewport_width=kwargs.get("viewport_width", 1920), + viewport_height=kwargs.get("viewport_height", 1080), + accept_downloads=kwargs.get("accept_downloads", False), + downloads_path=kwargs.get("downloads_path"), + storage_state=kwargs.get("storage_state"), + ignore_https_errors=kwargs.get("ignore_https_errors", True), + java_script_enabled=kwargs.get("java_script_enabled", True), + cookies=kwargs.get("cookies", []), + headers=kwargs.get("headers", {}), + user_agent=kwargs.get("user_agent", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " + "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36" + ), + user_agent_mode=kwargs.get("user_agent_mode"), + user_agent_generator_config=kwargs.get("user_agent_generator_config"), + text_only=kwargs.get("text_only", False), + 
light_mode=kwargs.get("light_mode", False), + extra_args=kwargs.get("extra_args", []) + ) + + +class CrawlerRunConfig: + """ + Configuration class for controlling how the crawler runs each crawl operation. + This includes parameters for content extraction, page manipulation, waiting conditions, + caching, and other runtime behaviors. + + This centralizes parameters that were previously scattered as kwargs to `arun()` and related methods. + By using this class, you have a single place to understand and adjust the crawling options. + + Attributes: + word_count_threshold (int): Minimum word count threshold before processing content. + Default: MIN_WORD_THRESHOLD (typically 200). + extraction_strategy (ExtractionStrategy or None): Strategy to extract structured data from crawled pages. + Default: None (NoExtractionStrategy is used if None). + chunking_strategy (ChunkingStrategy): Strategy to chunk content before extraction. + Default: RegexChunking(). + content_filter (RelevantContentFilter or None): Optional filter to prune irrelevant content. + Default: None. + cache_mode (CacheMode or None): Defines how caching is handled. + If None, defaults to CacheMode.ENABLED internally. + Default: None. + session_id (str or None): Optional session ID to persist the browser context and the created + page instance. If the ID already exists, the crawler does not + create a new page and uses the current page to preserve the state; + if not, it creates a new page and context then stores it in + memory with the given session ID. + bypass_cache (bool): Legacy parameter, if True acts like CacheMode.BYPASS. + Default: False. + disable_cache (bool): Legacy parameter, if True acts like CacheMode.DISABLED. + Default: False. + no_cache_read (bool): Legacy parameter, if True acts like CacheMode.WRITE_ONLY. + Default: False. + no_cache_write (bool): Legacy parameter, if True acts like CacheMode.READ_ONLY. + Default: False. 
+ css_selector (str or None): CSS selector to extract a specific portion of the page. + Default: None. + screenshot (bool): Whether to take a screenshot after crawling. + Default: False. + pdf (bool): Whether to generate a PDF of the page. + Default: False. + verbose (bool): Enable verbose logging. + Default: True. + only_text (bool): If True, attempt to extract text-only content where applicable. + Default: False. + image_description_min_word_threshold (int): Minimum words for image description extraction. + Default: IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD (e.g., 50). + prettiify (bool): If True, apply `fast_format_html` to produce prettified HTML output. + Default: False. + js_code (str or list of str or None): JavaScript code/snippets to run on the page. + Default: None. + wait_for (str or None): A CSS selector or JS condition to wait for before extracting content. + Default: None. + js_only (bool): If True, indicates subsequent calls are JS-driven updates, not full page loads. + Default: False. + wait_until (str): The condition to wait for when navigating, e.g. "domcontentloaded". + Default: "domcontentloaded". + page_timeout (int): Timeout in ms for page operations like navigation. + Default: 60000 (60 seconds). + ignore_body_visibility (bool): If True, ignore whether the body is visible before proceeding. + Default: True. + wait_for_images (bool): If True, wait for images to load before extracting content. + Default: True. + adjust_viewport_to_content (bool): If True, adjust viewport according to the page content dimensions. + Default: False. + scan_full_page (bool): If True, scroll through the entire page to load all content. + Default: False. + scroll_delay (float): Delay in seconds between scroll steps if scan_full_page is True. + Default: 0.2. + process_iframes (bool): If True, attempts to process and inline iframe content. + Default: False. + remove_overlay_elements (bool): If True, remove overlays/popups before extracting HTML. + Default: False. 
+ delay_before_return_html (float): Delay in seconds before retrieving final HTML. + Default: 0.1. + log_console (bool): If True, log console messages from the page. + Default: False. + simulate_user (bool): If True, simulate user interactions (mouse moves, clicks) for anti-bot measures. + Default: False. + override_navigator (bool): If True, overrides navigator properties for more human-like behavior. + Default: False. + magic (bool): If True, attempts automatic handling of overlays/popups. + Default: False. + screenshot_wait_for (float or None): Additional wait time before taking a screenshot. + Default: None. + screenshot_height_threshold (int): Threshold for page height to decide screenshot strategy. + Default: SCREENSHOT_HEIGHT_TRESHOLD (from config, e.g. 20000). + mean_delay (float): Mean base delay between requests when calling arun_many. + Default: 0.1. + max_range (float): Max random additional delay range for requests in arun_many. + Default: 0.3. + # session_id and semaphore_count might be set at runtime, not needed as defaults here. 
+ """ + + def __init__( + self, + word_count_threshold: int = MIN_WORD_THRESHOLD , + extraction_strategy : ExtractionStrategy=None, # Will default to NoExtractionStrategy if None + chunking_strategy : ChunkingStrategy= None, # Will default to RegexChunking if None + content_filter=None, + cache_mode=None, + session_id: str = None, + bypass_cache: bool = False, + disable_cache: bool = False, + no_cache_read: bool = False, + no_cache_write: bool = False, + css_selector: str = None, + screenshot: bool = False, + pdf: bool = False, + verbose: bool = True, + only_text: bool = False, + image_description_min_word_threshold: int = IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD, + prettiify: bool = False, + js_code=None, + wait_for: str = None, + js_only: bool = False, + wait_until: str = "domcontentloaded", + page_timeout: int = PAGE_TIMEOUT, + ignore_body_visibility: bool = True, + wait_for_images: bool = True, + adjust_viewport_to_content: bool = False, + scan_full_page: bool = False, + scroll_delay: float = 0.2, + process_iframes: bool = False, + remove_overlay_elements: bool = False, + delay_before_return_html: float = 0.1, + log_console: bool = False, + simulate_user: bool = False, + override_navigator: bool = False, + magic: bool = False, + screenshot_wait_for: float = None, + screenshot_height_threshold: int = SCREENSHOT_HEIGHT_TRESHOLD, + mean_delay: float = 0.1, + max_range: float = 0.3, + semaphore_count: int = 5, + ): + self.word_count_threshold = word_count_threshold + self.extraction_strategy = extraction_strategy + self.chunking_strategy = chunking_strategy + self.content_filter = content_filter + self.cache_mode = cache_mode + self.session_id = session_id + self.bypass_cache = bypass_cache + self.disable_cache = disable_cache + self.no_cache_read = no_cache_read + self.no_cache_write = no_cache_write + self.css_selector = css_selector + self.screenshot = screenshot + self.pdf = pdf + self.verbose = verbose + self.only_text = only_text + 
self.image_description_min_word_threshold = image_description_min_word_threshold + self.prettiify = prettiify + self.js_code = js_code + self.wait_for = wait_for + self.js_only = js_only + self.wait_until = wait_until + self.page_timeout = page_timeout + self.ignore_body_visibility = ignore_body_visibility + self.wait_for_images = wait_for_images + self.adjust_viewport_to_content = adjust_viewport_to_content + self.scan_full_page = scan_full_page + self.scroll_delay = scroll_delay + self.process_iframes = process_iframes + self.remove_overlay_elements = remove_overlay_elements + self.delay_before_return_html = delay_before_return_html + self.log_console = log_console + self.simulate_user = simulate_user + self.override_navigator = override_navigator + self.magic = magic + self.screenshot_wait_for = screenshot_wait_for + self.screenshot_height_threshold = screenshot_height_threshold + self.mean_delay = mean_delay + self.max_range = max_range + self.semaphore_count = semaphore_count + + # Validate type of extraction strategy and chunking strategy if they are provided + if self.extraction_strategy is not None and not isinstance(self.extraction_strategy, ExtractionStrategy): + raise ValueError("extraction_strategy must be an instance of ExtractionStrategy") + if self.chunking_strategy is not None and not isinstance(self.chunking_strategy, ChunkingStrategy): + raise ValueError("chunking_strategy must be an instance of ChunkingStrategy") + + # Set default chunking strategy if None + if self.chunking_strategy is None: + from .chunking_strategy import RegexChunking + self.chunking_strategy = RegexChunking() + + + @staticmethod + def from_kwargs(kwargs: dict) -> "CrawlerRunConfig": + return CrawlerRunConfig( + word_count_threshold=kwargs.get("word_count_threshold", 200), + extraction_strategy=kwargs.get("extraction_strategy"), + chunking_strategy=kwargs.get("chunking_strategy"), + content_filter=kwargs.get("content_filter"), + cache_mode=kwargs.get("cache_mode"), + 
session_id=kwargs.get("session_id"), + bypass_cache=kwargs.get("bypass_cache", False), + disable_cache=kwargs.get("disable_cache", False), + no_cache_read=kwargs.get("no_cache_read", False), + no_cache_write=kwargs.get("no_cache_write", False), + css_selector=kwargs.get("css_selector"), + screenshot=kwargs.get("screenshot", False), + pdf=kwargs.get("pdf", False), + verbose=kwargs.get("verbose", True), + only_text=kwargs.get("only_text", False), + image_description_min_word_threshold=kwargs.get("image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD), + prettiify=kwargs.get("prettiify", False), + js_code=kwargs.get("js_code"), # If not provided here, will default inside constructor + wait_for=kwargs.get("wait_for"), + js_only=kwargs.get("js_only", False), + wait_until=kwargs.get("wait_until", "domcontentloaded"), + page_timeout=kwargs.get("page_timeout", 60000), + ignore_body_visibility=kwargs.get("ignore_body_visibility", True), + adjust_viewport_to_content=kwargs.get("adjust_viewport_to_content", False), + scan_full_page=kwargs.get("scan_full_page", False), + scroll_delay=kwargs.get("scroll_delay", 0.2), + process_iframes=kwargs.get("process_iframes", False), + remove_overlay_elements=kwargs.get("remove_overlay_elements", False), + delay_before_return_html=kwargs.get("delay_before_return_html", 0.1), + log_console=kwargs.get("log_console", False), + simulate_user=kwargs.get("simulate_user", False), + override_navigator=kwargs.get("override_navigator", False), + magic=kwargs.get("magic", False), + screenshot_wait_for=kwargs.get("screenshot_wait_for"), + screenshot_height_threshold=kwargs.get("screenshot_height_threshold", 20000), + mean_delay=kwargs.get("mean_delay", 0.1), + max_range=kwargs.get("max_range", 0.3), + semaphore_count=kwargs.get("semaphore_count", 5) + ) diff --git a/crawl4ai/async_crawler_strategy.current.py b/crawl4ai/async_crawler_strategy.current.py deleted file mode 100644 index 6302447c..00000000 --- 
a/crawl4ai/async_crawler_strategy.current.py +++ /dev/null @@ -1,1475 +0,0 @@ -import asyncio -import base64 -import time -from abc import ABC, abstractmethod -from typing import Callable, Dict, Any, List, Optional, Awaitable -import os, sys, shutil -import tempfile, subprocess -from playwright.async_api import async_playwright, Page, Browser, Error -from playwright.async_api import TimeoutError as PlaywrightTimeoutError -from io import BytesIO -from PIL import Image, ImageDraw, ImageFont -from pathlib import Path -from playwright.async_api import ProxySettings -from pydantic import BaseModel -import hashlib -import json -import uuid -from .models import AsyncCrawlResponse -from .utils import create_box_message -from .user_agent_generator import UserAgentGenerator -from playwright_stealth import StealthConfig, stealth_async - -stealth_config = StealthConfig( - webdriver=True, - chrome_app=True, - chrome_csi=True, - chrome_load_times=True, - chrome_runtime=True, - navigator_languages=True, - navigator_plugins=True, - navigator_permissions=True, - webgl_vendor=True, - outerdimensions=True, - navigator_hardware_concurrency=True, - media_codecs=True, -) - -BROWSER_DISABLE_OPTIONS = [ - "--disable-background-networking", - "--disable-background-timer-throttling", - "--disable-backgrounding-occluded-windows", - "--disable-breakpad", - "--disable-client-side-phishing-detection", - "--disable-component-extensions-with-background-pages", - "--disable-default-apps", - "--disable-extensions", - "--disable-features=TranslateUI", - "--disable-hang-monitor", - "--disable-ipc-flooding-protection", - "--disable-popup-blocking", - "--disable-prompt-on-repost", - "--disable-sync", - "--force-color-profile=srgb", - "--metrics-recording-only", - "--no-first-run", - "--password-store=basic", - "--use-mock-keychain" -] - - -class ManagedBrowser: - def __init__(self, browser_type: str = "chromium", user_data_dir: Optional[str] = None, headless: bool = False, logger = None, host: str = 
"localhost", debugging_port: int = 9222): - self.browser_type = browser_type - self.user_data_dir = user_data_dir - self.headless = headless - self.browser_process = None - self.temp_dir = None - self.debugging_port = debugging_port - self.host = host - self.logger = logger - self.shutting_down = False - - async def start(self) -> str: - """ - Starts the browser process and returns the CDP endpoint URL. - If user_data_dir is not provided, creates a temporary directory. - """ - - # Create temp dir if needed - if not self.user_data_dir: - self.temp_dir = tempfile.mkdtemp(prefix="browser-profile-") - self.user_data_dir = self.temp_dir - - # Get browser path and args based on OS and browser type - browser_path = self._get_browser_path() - args = self._get_browser_args() - - # Start browser process - try: - self.browser_process = subprocess.Popen( - args, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE - ) - # Monitor browser process output for errors - asyncio.create_task(self._monitor_browser_process()) - await asyncio.sleep(2) # Give browser time to start - return f"http://{self.host}:{self.debugging_port}" - except Exception as e: - await self.cleanup() - raise Exception(f"Failed to start browser: {e}") - - async def _monitor_browser_process(self): - """Monitor the browser process for unexpected termination.""" - if self.browser_process: - try: - stdout, stderr = await asyncio.gather( - asyncio.to_thread(self.browser_process.stdout.read), - asyncio.to_thread(self.browser_process.stderr.read) - ) - - # Check shutting_down flag BEFORE logging anything - if self.browser_process.poll() is not None: - if not self.shutting_down: - self.logger.error( - message="Browser process terminated unexpectedly | Code: {code} | STDOUT: {stdout} | STDERR: {stderr}", - tag="ERROR", - params={ - "code": self.browser_process.returncode, - "stdout": stdout.decode(), - "stderr": stderr.decode() - } - ) - await self.cleanup() - else: - self.logger.info( - message="Browser process 
terminated normally | Code: {code}", - tag="INFO", - params={"code": self.browser_process.returncode} - ) - except Exception as e: - if not self.shutting_down: - self.logger.error( - message="Error monitoring browser process: {error}", - tag="ERROR", - params={"error": str(e)} - ) - - def _get_browser_path(self) -> str: - """Returns the browser executable path based on OS and browser type""" - if sys.platform == "darwin": # macOS - paths = { - "chromium": "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome", - "firefox": "/Applications/Firefox.app/Contents/MacOS/firefox", - "webkit": "/Applications/Safari.app/Contents/MacOS/Safari" - } - elif sys.platform == "win32": # Windows - paths = { - "chromium": "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe", - "firefox": "C:\\Program Files\\Mozilla Firefox\\firefox.exe", - "webkit": None # WebKit not supported on Windows - } - else: # Linux - paths = { - "chromium": "google-chrome", - "firefox": "firefox", - "webkit": None # WebKit not supported on Linux - } - - return paths.get(self.browser_type) - - def _get_browser_args(self) -> List[str]: - """Returns browser-specific command line arguments""" - base_args = [self._get_browser_path()] - - if self.browser_type == "chromium": - args = [ - f"--remote-debugging-port={self.debugging_port}", - f"--user-data-dir={self.user_data_dir}", - ] - if self.headless: - args.append("--headless=new") - elif self.browser_type == "firefox": - args = [ - "--remote-debugging-port", str(self.debugging_port), - "--profile", self.user_data_dir, - ] - if self.headless: - args.append("--headless") - else: - raise NotImplementedError(f"Browser type {self.browser_type} not supported") - - return base_args + args - - async def cleanup(self): - """Cleanup browser process and temporary directory""" - # Set shutting_down flag BEFORE any termination actions - self.shutting_down = True - - if self.browser_process: - try: - self.browser_process.terminate() - # Wait for process to 
end gracefully - for _ in range(10): # 10 attempts, 100ms each - if self.browser_process.poll() is not None: - break - await asyncio.sleep(0.1) - - # Force kill if still running - if self.browser_process.poll() is None: - self.browser_process.kill() - await asyncio.sleep(0.1) # Brief wait for kill to take effect - - except Exception as e: - self.logger.error( - message="Error terminating browser: {error}", - tag="ERROR", - params={"error": str(e)} - ) - - if self.temp_dir and os.path.exists(self.temp_dir): - try: - shutil.rmtree(self.temp_dir) - except Exception as e: - self.logger.error( - message="Error removing temporary directory: {error}", - tag="ERROR", - params={"error": str(e)} - ) - - -class AsyncCrawlerStrategy(ABC): - @abstractmethod - async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse: - pass - - @abstractmethod - async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]: - pass - - @abstractmethod - async def take_screenshot(self, **kwargs) -> str: - pass - - @abstractmethod - def update_user_agent(self, user_agent: str): - pass - - @abstractmethod - def set_hook(self, hook_type: str, hook: Callable): - pass - -class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): - def __init__(self, use_cached_html=False, js_code=None, logger = None, **kwargs): - self.text_only = kwargs.get("text_only", False) - self.light_mode = kwargs.get("light_mode", False) - self.logger = logger - self.use_cached_html = use_cached_html - self.viewport_width = kwargs.get("viewport_width", 800 if self.text_only else 1920) - self.viewport_height = kwargs.get("viewport_height", 600 if self.text_only else 1080) - - if self.text_only: - self.extra_args = kwargs.get("extra_args", []) + [ - '--disable-images', - '--disable-javascript', - '--disable-gpu', - '--disable-software-rasterizer', - '--disable-dev-shm-usage' - ] - - self.user_agent = kwargs.get( - "user_agent", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) 
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.5845.187 Safari/604.1 Edg/117.0.2045.47" - # "Mozilla/5.0 (Linux; Android 11; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.120 Mobile Safari/537.36" - ) - user_agenr_generator = UserAgentGenerator() - if kwargs.get("user_agent_mode") == "random": - self.user_agent = user_agenr_generator.generate( - **kwargs.get("user_agent_generator_config", {}) - ) - self.proxy = kwargs.get("proxy") - self.proxy_config = kwargs.get("proxy_config") - self.headless = kwargs.get("headless", True) - self.browser_type = kwargs.get("browser_type", "chromium") - self.headers = kwargs.get("headers", {}) - self.browser_hint = user_agenr_generator.generate_client_hints(self.user_agent) - self.headers.setdefault("sec-ch-ua", self.browser_hint) - self.cookies = kwargs.get("cookies", []) - self.storage_state = kwargs.get("storage_state", None) - self.sessions = {} - self.session_ttl = 1800 - self.js_code = js_code - self.verbose = kwargs.get("verbose", False) - self.playwright = None - self.browser = None - self.sleep_on_close = kwargs.get("sleep_on_close", False) - self.use_managed_browser = kwargs.get("use_managed_browser", False) - self.user_data_dir = kwargs.get("user_data_dir", None) - self.use_persistent_context = kwargs.get("use_persistent_context", False) - self.chrome_channel = kwargs.get("chrome_channel", "chrome") - self.managed_browser = None - self.default_context = None - self.hooks = { - 'on_browser_created': None, - 'on_user_agent_updated': None, - 'on_execution_started': None, - 'before_goto': None, - 'after_goto': None, - 'before_return_html': None, - 'before_retrieve_html': None - } - self.extra_args = kwargs.get("extra_args", []) - self.ignore_https_errors = kwargs.get("ignore_https_errors", True) - self.java_script_enabled = kwargs.get("java_script_enabled", True) - self.accept_downloads = kwargs.get("accept_downloads", False) - self.downloads_path = kwargs.get("downloads_path") - 
self._downloaded_files = [] # Track downloaded files for current crawl - if self.accept_downloads and not self.downloads_path: - self.downloads_path = os.path.join(os.getcwd(), "downloads") - os.makedirs(self.downloads_path, exist_ok=True) - - - async def __aenter__(self): - await self.start() - return self - - async def __aexit__(self, exc_type, exc_val, exc_tb): - await self.close() - - async def start(self): - if self.playwright is None: - self.playwright = await async_playwright().start() - if self.browser is None: - if self.use_managed_browser: - # Use managed browser approach - self.managed_browser = ManagedBrowser( - browser_type=self.browser_type, - user_data_dir=self.user_data_dir, - headless=self.headless, - logger=self.logger - ) - cdp_url = await self.managed_browser.start() - self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url) - - # Get the default context that maintains the user profile - contexts = self.browser.contexts - if contexts: - self.default_context = contexts[0] - else: - # If no default context exists, create one - self.default_context = await self.browser.new_context( - viewport={"width": self.viewport_width, "height": self.viewport_height}, - storage_state=self.storage_state, - user_agent= self.user_agent, - accept_downloads=self.accept_downloads, - ignore_https_errors=self.ignore_https_errors, - java_script_enabled=self.java_script_enabled, - ) - - # Set up the default context - if self.default_context: - await self.default_context.set_extra_http_headers(self.headers) - if self.cookies: - await self.default_context.add_cookies(self.cookies) - if self.storage_state: - # If storage_state is a dictionary or file path, Playwright will handle it. 
- await self.default_context.storage_state(path=None) # Just ensuring default_context is ready - if self.accept_downloads: - await self.default_context.set_default_timeout(60000) - await self.default_context.set_default_navigation_timeout(60000) - self.default_context._impl_obj._options["accept_downloads"] = True - self.default_context._impl_obj._options["downloads_path"] = self.downloads_path - - if self.user_agent: - await self.default_context.set_extra_http_headers({ - "User-Agent": self.user_agent, - "sec-ch-ua": self.browser_hint, - # **self.headers - }) - else: - # Base browser arguments - browser_args = { - "headless": self.headless, - "args": [ - "--no-sandbox", - "--disable-dev-shm-usage", - "--no-first-run", - "--no-default-browser-check", - "--disable-infobars", - "--window-position=0,0", - "--ignore-certificate-errors", - "--ignore-certificate-errors-spki-list", - "--disable-blink-features=AutomationControlled", - "--window-position=400,0", - f"--window-size={self.viewport_width},{self.viewport_height}", - ] - } - - if self.light_mode: - browser_args["args"].extend(BROWSER_DISABLE_OPTIONS) - - if self.text_only: - browser_args["args"].extend([ - '--blink-settings=imagesEnabled=false', - '--disable-remote-fonts' - ]) - - # Add channel if specified (try Chrome first) - if self.chrome_channel: - browser_args["channel"] = self.chrome_channel - - # Add extra args if provided - if self.extra_args: - browser_args["args"].extend(self.extra_args) - - # Add downloads path if downloads are enabled - if self.accept_downloads: - browser_args["downloads_path"] = self.downloads_path - - # Add proxy settings if a proxy is specified - if self.proxy: - proxy_settings = ProxySettings(server=self.proxy) - browser_args["proxy"] = proxy_settings - elif self.proxy_config: - proxy_settings = ProxySettings( - server=self.proxy_config.get("server"), - username=self.proxy_config.get("username"), - password=self.proxy_config.get("password") - ) - browser_args["proxy"] = 
proxy_settings - - try: - # Select the appropriate browser based on the browser_type - if self.browser_type == "firefox": - self.browser = await self.playwright.firefox.launch(**browser_args) - elif self.browser_type == "webkit": - if "viewport" not in browser_args: - browser_args["viewport"] = {"width": self.viewport_width, "height": self.viewport_height} - self.browser = await self.playwright.webkit.launch(**browser_args) - else: - if self.use_persistent_context and self.user_data_dir: - self.browser = await self.playwright.chromium.launch_persistent_context( - user_data_dir=self.user_data_dir, - accept_downloads=self.accept_downloads, - downloads_path=self.downloads_path if self.accept_downloads else None, - **browser_args - ) - self.default_context = self.browser - else: - self.browser = await self.playwright.chromium.launch(**browser_args) - self.default_context = self.browser - - except Exception as e: - # Fallback to chromium if Chrome channel fails - if "chrome" in str(e) and browser_args.get("channel") == "chrome": - browser_args["channel"] = "chromium" - if self.use_persistent_context and self.user_data_dir: - self.browser = await self.playwright.chromium.launch_persistent_context( - user_data_dir=self.user_data_dir, - **browser_args - ) - self.default_context = self.browser - else: - self.browser = await self.playwright.chromium.launch(**browser_args) - else: - raise - - await self.execute_hook('on_browser_created', self.browser) - - async def close(self): - if self.sleep_on_close: - await asyncio.sleep(0.5) - - # Close all active sessions - session_ids = list(self.sessions.keys()) - for session_id in session_ids: - await self.kill_session(session_id) - - if self.browser: - await self.browser.close() - self.browser = None - - if self.managed_browser: - await asyncio.sleep(0.5) - await self.managed_browser.cleanup() - self.managed_browser = None - - if self.playwright: - await self.playwright.stop() - self.playwright = None - - # Issue #256: Remove 
__del__ method to avoid potential issues with async cleanup - # def __del__(self): - # if self.browser or self.playwright: - # asyncio.get_event_loop().run_until_complete(self.close()) - - def set_hook(self, hook_type: str, hook: Callable): - if hook_type in self.hooks: - self.hooks[hook_type] = hook - else: - raise ValueError(f"Invalid hook type: {hook_type}") - - async def execute_hook(self, hook_type: str, *args, **kwargs): - hook = self.hooks.get(hook_type) - if hook: - if asyncio.iscoroutinefunction(hook): - return await hook(*args, **kwargs) - else: - return hook(*args, **kwargs) - return args[0] if args else None - - def update_user_agent(self, user_agent: str): - self.user_agent = user_agent - - def set_custom_headers(self, headers: Dict[str, str]): - self.headers = headers - - async def kill_session(self, session_id: str): - if session_id in self.sessions: - context, page, _ = self.sessions[session_id] - await page.close() - if not self.use_managed_browser: - await context.close() - del self.sessions[session_id] - - def _cleanup_expired_sessions(self): - current_time = time.time() - expired_sessions = [ - sid for sid, (_, _, last_used) in self.sessions.items() - if current_time - last_used > self.session_ttl - ] - for sid in expired_sessions: - asyncio.create_task(self.kill_session(sid)) - - async def smart_wait(self, page: Page, wait_for: str, timeout: float = 30000): - wait_for = wait_for.strip() - - if wait_for.startswith('js:'): - # Explicitly specified JavaScript - js_code = wait_for[3:].strip() - return await self.csp_compliant_wait(page, js_code, timeout) - elif wait_for.startswith('css:'): - # Explicitly specified CSS selector - css_selector = wait_for[4:].strip() - try: - await page.wait_for_selector(css_selector, timeout=timeout) - except Error as e: - if 'Timeout' in str(e): - raise TimeoutError(f"Timeout after {timeout}ms waiting for selector '{css_selector}'") - else: - raise ValueError(f"Invalid CSS selector: '{css_selector}'") - else: - # 
Auto-detect based on content - if wait_for.startswith('()') or wait_for.startswith('function'): - # It's likely a JavaScript function - return await self.csp_compliant_wait(page, wait_for, timeout) - else: - # Assume it's a CSS selector first - try: - await page.wait_for_selector(wait_for, timeout=timeout) - except Error as e: - if 'Timeout' in str(e): - raise TimeoutError(f"Timeout after {timeout}ms waiting for selector '{wait_for}'") - else: - # If it's not a timeout error, it might be an invalid selector - # Let's try to evaluate it as a JavaScript function as a fallback - try: - return await self.csp_compliant_wait(page, f"() => {{{wait_for}}}", timeout) - except Error: - raise ValueError(f"Invalid wait_for parameter: '{wait_for}'. " - "It should be either a valid CSS selector, a JavaScript function, " - "or explicitly prefixed with 'js:' or 'css:'.") - - async def csp_compliant_wait(self, page: Page, user_wait_function: str, timeout: float = 30000): - wrapper_js = f""" - async () => {{ - const userFunction = {user_wait_function}; - const startTime = Date.now(); - while (true) {{ - if (await userFunction()) {{ - return true; - }} - if (Date.now() - startTime > {timeout}) {{ - throw new Error('Timeout waiting for condition'); - }} - await new Promise(resolve => setTimeout(resolve, 100)); - }} - }} - """ - - try: - await page.evaluate(wrapper_js) - except TimeoutError: - raise TimeoutError(f"Timeout after {timeout}ms waiting for condition") - except Exception as e: - raise RuntimeError(f"Error in wait condition: {str(e)}") - - async def process_iframes(self, page): - # Find all iframes - iframes = await page.query_selector_all('iframe') - - for i, iframe in enumerate(iframes): - try: - # Add a unique identifier to the iframe - await iframe.evaluate(f'(element) => element.id = "iframe-{i}"') - - # Get the frame associated with this iframe - frame = await iframe.content_frame() - - if frame: - # Wait for the frame to load - await frame.wait_for_load_state('load', 
timeout=30000) # 30 seconds timeout - - # Extract the content of the iframe's body - iframe_content = await frame.evaluate('() => document.body.innerHTML') - - # Generate a unique class name for this iframe - class_name = f'extracted-iframe-content-{i}' - - # Replace the iframe with a div containing the extracted content - _iframe = iframe_content.replace('`', '\\`') - await page.evaluate(f""" - () => {{ - const iframe = document.getElementById('iframe-{i}'); - const div = document.createElement('div'); - div.innerHTML = `{_iframe}`; - div.className = '{class_name}'; - iframe.replaceWith(div); - }} - """) - else: - # print(f"Warning: Could not access content frame for iframe {i}") - self.logger.warning( - message="Could not access content frame for iframe {index}", - tag="SCRAPE", - params={"index": i} - ) - except Exception as e: - self.logger.error( - message="Error processing iframe {index}: {error}", - tag="ERROR", - params={"index": i, "error": str(e)} - ) - # print(f"Error processing iframe {i}: {str(e)}") - - # Return the page object - return page - - async def create_session(self, **kwargs) -> str: - """Creates a new browser session and returns its ID.""" - if not self.browser: - await self.start() - - session_id = kwargs.get('session_id') or str(uuid.uuid4()) - - if self.use_managed_browser: - page = await self.default_context.new_page() - self.sessions[session_id] = (self.default_context, page, time.time()) - else: - if self.use_persistent_context and self.browser_type in ["chrome", "chromium"]: - context = self.browser - page = await context.new_page() - else: - context = await self.browser.new_context( - user_agent=kwargs.get("user_agent", self.user_agent), - viewport={"width": self.viewport_width, "height": self.viewport_height}, - proxy={"server": self.proxy} if self.proxy else None, - accept_downloads=self.accept_downloads, - storage_state=self.storage_state, - ignore_https_errors=True - ) - - if self.cookies: - await 
context.add_cookies(self.cookies) - await context.set_extra_http_headers(self.headers) - page = await context.new_page() - - self.sessions[session_id] = (context, page, time.time()) - - return session_id - - async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse: - """ - Crawls a given URL or processes raw HTML/local file content based on the URL prefix. - - Args: - url (str): The URL to crawl. Supported prefixes: - - 'http://' or 'https://': Web URL to crawl. - - 'file://': Local file path to process. - - 'raw:': Raw HTML content to process. - **kwargs: Additional parameters: - - 'screenshot' (bool): Whether to take a screenshot. - - ... [other existing parameters] - - Returns: - AsyncCrawlResponse: The response containing HTML, headers, status code, and optional screenshot. - """ - response_headers = {} - status_code = 200 # Default to 200 for local/raw HTML - screenshot_requested = kwargs.get('screenshot', False) - screenshot_data = None - - if url.startswith(('http://', 'https://')): - # Proceed with standard web crawling - return await self._crawl_web(url, **kwargs) - - elif url.startswith('file://'): - # Process local file - local_file_path = url[7:] # Remove 'file://' prefix - if not os.path.exists(local_file_path): - raise FileNotFoundError(f"Local file not found: {local_file_path}") - with open(local_file_path, 'r', encoding='utf-8') as f: - html = f.read() - if screenshot_requested: - screenshot_data = await self._generate_screenshot_from_html(html) - return AsyncCrawlResponse( - html=html, - response_headers=response_headers, - status_code=status_code, - screenshot=screenshot_data, - get_delayed_content=None - ) - - elif url.startswith('raw:'): - # Process raw HTML content - raw_html = url[4:] # Remove 'raw:' prefix - html = raw_html - if screenshot_requested: - screenshot_data = await self._generate_screenshot_from_html(html) - return AsyncCrawlResponse( - html=html, - response_headers=response_headers, - status_code=status_code, - 
screenshot=screenshot_data, - get_delayed_content=None - ) - else: - raise ValueError("URL must start with 'http://', 'https://', 'file://', or 'raw:'") - - - async def _crawl_web(self, url: str, **kwargs) -> AsyncCrawlResponse: - """ - Existing web crawling logic remains unchanged. - - Args: - url (str): The web URL to crawl. - **kwargs: Additional parameters. - - Returns: - AsyncCrawlResponse: The response containing HTML, headers, status code, and optional screenshot. - """ - response_headers = {} - status_code = None - - # Reset downloaded files list for new crawl - self._downloaded_files = [] - - self._cleanup_expired_sessions() - session_id = kwargs.get("session_id") - - # Check if in kwargs we have user_agent that will override the default user_agent - user_agent = kwargs.get("user_agent", self.user_agent) - - # Generate random user agent if magic mode is enabled and user_agent_mode is not random - if kwargs.get("user_agent_mode") != "random" and kwargs.get("magic", False): - user_agent = UserAgentGenerator().generate( - **kwargs.get("user_agent_generator_config", {}) - ) - - # Handle page creation differently for managed browser - context = None - if self.use_managed_browser: - if session_id: - # Reuse existing session if available - context, page, _ = self.sessions.get(session_id, (None, None, None)) - if not page: - # Create new page in default context if session doesn't exist - page = await self.default_context.new_page() - self.sessions[session_id] = (self.default_context, page, time.time()) - else: - # Create new page in default context for non-session requests - page = await self.default_context.new_page() - else: - if session_id: - context, page, _ = self.sessions.get(session_id, (None, None, None)) - if not context: - if self.use_persistent_context and self.browser_type in ["chrome", "chromium"]: - # In persistent context, browser is the context - context = self.browser - else: - # Normal context creation for non-persistent or non-Chrome browsers - 
context = await self.browser.new_context( - user_agent=user_agent, - viewport={"width": self.viewport_width, "height": self.viewport_height}, - proxy={"server": self.proxy} if self.proxy else None, - java_script_enabled=True, - accept_downloads=self.accept_downloads, - storage_state=self.storage_state, - # downloads_path=self.downloads_path if self.accept_downloads else None - ) - await context.add_cookies([{"name": "cookiesEnabled", "value": "true", "url": url}]) - if self.cookies: - await context.add_cookies(self.cookies) - await context.set_extra_http_headers(self.headers) - - page = await context.new_page() - self.sessions[session_id] = (context, page, time.time()) - else: - if self.use_persistent_context and self.browser_type in ["chrome", "chromium"]: - # In persistent context, browser is the context - context = self.browser - else: - # Normal context creation - context = await self.browser.new_context( - user_agent=user_agent, - # viewport={"width": 1920, "height": 1080}, - viewport={"width": self.viewport_width, "height": self.viewport_height}, - proxy={"server": self.proxy} if self.proxy else None, - accept_downloads=self.accept_downloads, - storage_state=self.storage_state, - ignore_https_errors=True # Add this line - ) - if self.cookies: - await context.add_cookies(self.cookies) - await context.set_extra_http_headers(self.headers) - - if kwargs.get("override_navigator", False) or kwargs.get("simulate_user", False) or kwargs.get("magic", False): - # Inject scripts to override navigator properties - await context.add_init_script(""" - // Pass the Permissions Test. - const originalQuery = window.navigator.permissions.query; - window.navigator.permissions.query = (parameters) => ( - parameters.name === 'notifications' ? 
- Promise.resolve({ state: Notification.permission }) : - originalQuery(parameters) - ); - Object.defineProperty(navigator, 'webdriver', { - get: () => undefined - }); - window.navigator.chrome = { - runtime: {}, - // Add other properties if necessary - }; - Object.defineProperty(navigator, 'plugins', { - get: () => [1, 2, 3, 4, 5], - }); - Object.defineProperty(navigator, 'languages', { - get: () => ['en-US', 'en'], - }); - Object.defineProperty(document, 'hidden', { - get: () => false - }); - Object.defineProperty(document, 'visibilityState', { - get: () => 'visible' - }); - """) - - page = await context.new_page() - if kwargs.get("magic", False): - await stealth_async(page, stealth_config) - - # Add console message and error logging - if kwargs.get("log_console", False): - page.on("console", lambda msg: print(f"Console: {msg.text}")) - page.on("pageerror", lambda exc: print(f"Page Error: {exc}")) - - try: - # Set up download handling if enabled - if self.accept_downloads: - page.on("download", lambda download: asyncio.create_task(self._handle_download(download))) - - if self.use_cached_html: - cache_file_path = os.path.join( - os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest() - ) - if os.path.exists(cache_file_path): - html = "" - with open(cache_file_path, "r") as f: - html = f.read() - # retrieve response headers and status code from cache - with open(cache_file_path + ".meta", "r") as f: - meta = json.load(f) - response_headers = meta.get("response_headers", {}) - status_code = meta.get("status_code") - response = AsyncCrawlResponse( - html=html, response_headers=response_headers, status_code=status_code - ) - return response - - if not kwargs.get("js_only", False): - await self.execute_hook('before_goto', page, context = context, **kwargs) - - try: - response = await page.goto( - url, - # wait_until=kwargs.get("wait_until", ["domcontentloaded", "networkidle"]), - 
wait_until=kwargs.get("wait_until", "domcontentloaded"), - timeout=kwargs.get("page_timeout", 60000), - ) - except Error as e: - raise RuntimeError(f"Failed on navigating ACS-GOTO :\n{str(e)}") - - await self.execute_hook('after_goto', page, context = context, **kwargs) - - # Get status code and headers - status_code = response.status - response_headers = response.headers - else: - status_code = 200 - response_headers = {} - - # Replace the current wait_for_selector line with this more robust check: - try: - # First wait for body to exist, regardless of visibility - await page.wait_for_selector('body', state='attached', timeout=30000) - - # Then wait for it to become visible by checking CSS - await page.wait_for_function(""" - () => { - const body = document.body; - const style = window.getComputedStyle(body); - return style.display !== 'none' && - style.visibility !== 'hidden' && - style.opacity !== '0'; - } - """, timeout=30000) - - except Error as e: - # If waiting fails, let's try to diagnose the issue - visibility_info = await page.evaluate(""" - () => { - const body = document.body; - const style = window.getComputedStyle(body); - return { - display: style.display, - visibility: style.visibility, - opacity: style.opacity, - hasContent: body.innerHTML.length, - classList: Array.from(body.classList) - } - } - """) - - if self.verbose: - print(f"Body visibility debug info: {visibility_info}") - - # Even if body is hidden, we might still want to proceed - if kwargs.get('ignore_body_visibility', True): - if self.verbose: - print("Proceeding despite hidden body...") - pass - else: - raise Error(f"Body element is hidden: {visibility_info}") - - # CONTENT LOADING ASSURANCE - if not self.text_only and (kwargs.get("wait_for_images", True) or kwargs.get("adjust_viewport_to_content", False)): - # Wait for network idle after initial load and images to load - # await page.wait_for_load_state("networkidle") - await page.wait_for_load_state("domcontentloaded") - await 
asyncio.sleep(0.1) - from playwright.async_api import TimeoutError as PlaywrightTimeoutError - try: - await page.wait_for_function("Array.from(document.images).every(img => img.complete)", timeout=1000) - # Check for TimeoutError and ignore it - except PlaywrightTimeoutError: - pass - - # After initial load, adjust viewport to content size - if not self.text_only and kwargs.get("adjust_viewport_to_content", False): - try: - # Get actual page dimensions - page_width = await page.evaluate("document.documentElement.scrollWidth") - page_height = await page.evaluate("document.documentElement.scrollHeight") - - target_width = self.viewport_width - target_height = int(target_width * page_width / page_height * 0.95) - await page.set_viewport_size({"width": target_width, "height": target_height}) - - # Compute scale factor - # We want the entire page visible: the scale should make both width and height fit - scale = min(target_width / page_width, target_height / page_height) - - # Now we call CDP to set metrics. - # We tell Chrome that the "device" is page_width x page_height in size, - # but we scale it down so everything fits within the real viewport. 
- cdp = await page.context.new_cdp_session(page) - await cdp.send('Emulation.setDeviceMetricsOverride', { - 'width': page_width, # full page width - 'height': page_height, # full page height - 'deviceScaleFactor': 1, # keep normal DPR - 'mobile': False, - 'scale': scale # scale the entire rendered content - }) - - except Exception as e: - self.logger.warning( - message="Failed to adjust viewport to content: {error}", - tag="VIEWPORT", - params={"error": str(e)} - ) - - # After viewport adjustment, handle page scanning if requested - if kwargs.get("scan_full_page", False): - try: - viewport_height = page.viewport_size.get("height", self.viewport_height) - current_position = viewport_height # Start with one viewport height - scroll_delay = kwargs.get("scroll_delay", 0.2) - - # Initial scroll - await page.evaluate(f"window.scrollTo(0, {current_position})") - await asyncio.sleep(scroll_delay) - - # Get height after first scroll to account for any dynamic content - total_height = await page.evaluate("document.documentElement.scrollHeight") - - while current_position < total_height: - current_position = min(current_position + viewport_height, total_height) - await page.evaluate(f"window.scrollTo(0, {current_position})") - await asyncio.sleep(scroll_delay) - - # Check for dynamic content - new_height = await page.evaluate("document.documentElement.scrollHeight") - if new_height > total_height: - total_height = new_height - - # Scroll back to top - await page.evaluate("window.scrollTo(0, 0)") - - except Exception as e: - self.logger.warning( - message="Failed to perform full page scan: {error}", - tag="PAGE_SCAN", - params={"error": str(e)} - ) - else: - # Scroll to the bottom of the page - await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") - - js_code = kwargs.get("js_code", kwargs.get("js", self.js_code)) - if js_code: - if isinstance(js_code, str): - await page.evaluate(js_code) - elif isinstance(js_code, list): - for js in js_code: - await 
page.evaluate(js) - - # await page.wait_for_timeout(100) - - # Check for on execution event - await self.execute_hook('on_execution_started', page, context = context, **kwargs) - - if kwargs.get("simulate_user", False) or kwargs.get("magic", False): - # Simulate user interactions - await page.mouse.move(100, 100) - await page.mouse.down() - await page.mouse.up() - await page.keyboard.press('ArrowDown') - - # Handle the wait_for parameter - wait_for = kwargs.get("wait_for") - if wait_for: - try: - await self.smart_wait(page, wait_for, timeout=kwargs.get("page_timeout", 60000)) - except Exception as e: - raise RuntimeError(f"Wait condition failed: {str(e)}") - - # if not wait_for and js_code: - # await page.wait_for_load_state('networkidle', timeout=5000) - - # Update image dimensions - if not self.text_only: - update_image_dimensions_js = """ - () => { - return new Promise((resolve) => { - const filterImage = (img) => { - // Filter out images that are too small - if (img.width < 100 && img.height < 100) return false; - - // Filter out images that are not visible - const rect = img.getBoundingClientRect(); - if (rect.width === 0 || rect.height === 0) return false; - - // Filter out images with certain class names (e.g., icons, thumbnails) - if (img.classList.contains('icon') || img.classList.contains('thumbnail')) return false; - - // Filter out images with certain patterns in their src (e.g., placeholder images) - if (img.src.includes('placeholder') || img.src.includes('icon')) return false; - - return true; - }; - - const images = Array.from(document.querySelectorAll('img')).filter(filterImage); - let imagesLeft = images.length; - - if (imagesLeft === 0) { - resolve(); - return; - } - - const checkImage = (img) => { - if (img.complete && img.naturalWidth !== 0) { - img.setAttribute('width', img.naturalWidth); - img.setAttribute('height', img.naturalHeight); - imagesLeft--; - if (imagesLeft === 0) resolve(); - } - }; - - images.forEach(img => { - checkImage(img); - 
if (!img.complete) { - img.onload = () => { - checkImage(img); - }; - img.onerror = () => { - imagesLeft--; - if (imagesLeft === 0) resolve(); - }; - } - }); - - // Fallback timeout of 5 seconds - // setTimeout(() => resolve(), 5000); - resolve(); - }); - } - """ - - try: - try: - await page.wait_for_load_state( - # state="load", - state="domcontentloaded", - timeout=5 - ) - except PlaywrightTimeoutError: - pass - await page.evaluate(update_image_dimensions_js) - except Exception as e: - self.logger.error( - message="Error updating image dimensions ACS-UPDATE_IMAGE_DIMENSIONS_JS: {error}", - tag="ERROR", - params={"error": str(e)} - ) - # raise RuntimeError(f"Error updating image dimensions ACS-UPDATE_IMAGE_DIMENSIONS_JS: {str(e)}") - - # Wait a bit for any onload events to complete - # await page.wait_for_timeout(100) - - # Process iframes - if kwargs.get("process_iframes", False): - page = await self.process_iframes(page) - - await self.execute_hook('before_retrieve_html', page, context = context, **kwargs) - # Check if delay_before_return_html is set then wait for that time - delay_before_return_html = kwargs.get("delay_before_return_html", 0.1) - if delay_before_return_html: - await asyncio.sleep(delay_before_return_html) - - # Check for remove_overlay_elements parameter - if kwargs.get("remove_overlay_elements", False): - await self.remove_overlay_elements(page) - - html = await page.content() - await self.execute_hook('before_return_html', page, html, context = context, **kwargs) - - # Check if kwargs has screenshot=True then take screenshot - screenshot_data = None - if kwargs.get("screenshot"): - # Check we have screenshot_wait_for parameter, if we have simply wait for that time - screenshot_wait_for = kwargs.get("screenshot_wait_for") - if screenshot_wait_for: - await asyncio.sleep(screenshot_wait_for) - screenshot_data = await self.take_screenshot(page) - - # if self.verbose: - # print(f"[LOG] ✅ Crawled {url} successfully!") - - if self.use_cached_html: - 
cache_file_path = os.path.join( - os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest() - ) - with open(cache_file_path, "w", encoding="utf-8") as f: - f.write(html) - # store response headers and status code in cache - with open(cache_file_path + ".meta", "w", encoding="utf-8") as f: - json.dump({ - "response_headers": response_headers, - "status_code": status_code - }, f) - - async def get_delayed_content(delay: float = 5.0) -> str: - if self.verbose: - print(f"[LOG] Waiting for {delay} seconds before retrieving content for {url}") - await asyncio.sleep(delay) - return await page.content() - - response = AsyncCrawlResponse( - html=html, - response_headers=response_headers, - status_code=status_code, - screenshot=screenshot_data, - get_delayed_content=get_delayed_content, - downloaded_files=self._downloaded_files if self._downloaded_files else None - ) - return response - except Error as e: - raise Error(f"async_crawler_strategy.py:_crawleb(): {str(e)}") - # finally: - # if not session_id: - # await page.close() - # await context.close() - - async def _handle_download(self, download): - """Handle file downloads.""" - try: - suggested_filename = download.suggested_filename - download_path = os.path.join(self.downloads_path, suggested_filename) - - self.logger.info( - message="Downloading {filename} to {path}", - tag="FETCH", - params={"filename": suggested_filename, "path": download_path} - ) - - start_time = time.perf_counter() - await download.save_as(download_path) - end_time = time.perf_counter() - self._downloaded_files.append(download_path) - - self.logger.success( - message="Downloaded {filename} successfully", - tag="COMPLETE", - params={"filename": suggested_filename, "path": download_path, "duration": f"{end_time - start_time:.2f}s"} - ) - except Exception as e: - self.logger.error( - message="Failed to handle download: {error}", - tag="ERROR", - params={"error": str(e)} - ) - - # if self.verbose: 
- # print(f"[ERROR] Failed to handle download: {str(e)}") - - async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]: - semaphore_count = kwargs.get('semaphore_count', 5) # Adjust as needed - semaphore = asyncio.Semaphore(semaphore_count) - - async def crawl_with_semaphore(url): - async with semaphore: - return await self.crawl(url, **kwargs) - - tasks = [crawl_with_semaphore(url) for url in urls] - results = await asyncio.gather(*tasks, return_exceptions=True) - return [result if not isinstance(result, Exception) else str(result) for result in results] - - async def remove_overlay_elements(self, page: Page) -> None: - """ - Removes popup overlays, modals, cookie notices, and other intrusive elements from the page. - - Args: - page (Page): The Playwright page instance - """ - remove_overlays_js = """ - async () => { - // Function to check if element is visible - const isVisible = (elem) => { - const style = window.getComputedStyle(elem); - return style.display !== 'none' && - style.visibility !== 'hidden' && - style.opacity !== '0'; - }; - - // Common selectors for popups and overlays - const commonSelectors = [ - // Close buttons first - 'button[class*="close" i]', 'button[class*="dismiss" i]', - 'button[aria-label*="close" i]', 'button[title*="close" i]', - 'a[class*="close" i]', 'span[class*="close" i]', - - // Cookie notices - '[class*="cookie-banner" i]', '[id*="cookie-banner" i]', - '[class*="cookie-consent" i]', '[id*="cookie-consent" i]', - - // Newsletter/subscription dialogs - '[class*="newsletter" i]', '[class*="subscribe" i]', - - // Generic popups/modals - '[class*="popup" i]', '[class*="modal" i]', - '[class*="overlay" i]', '[class*="dialog" i]', - '[role="dialog"]', '[role="alertdialog"]' - ]; - - // Try to click close buttons first - for (const selector of commonSelectors.slice(0, 6)) { - const closeButtons = document.querySelectorAll(selector); - for (const button of closeButtons) { - if (isVisible(button)) { - try { - 
button.click(); - await new Promise(resolve => setTimeout(resolve, 100)); - } catch (e) { - console.log('Error clicking button:', e); - } - } - } - } - - // Remove remaining overlay elements - const removeOverlays = () => { - // Find elements with high z-index - const allElements = document.querySelectorAll('*'); - for (const elem of allElements) { - const style = window.getComputedStyle(elem); - const zIndex = parseInt(style.zIndex); - const position = style.position; - - if ( - isVisible(elem) && - (zIndex > 999 || position === 'fixed' || position === 'absolute') && - ( - elem.offsetWidth > window.innerWidth * 0.5 || - elem.offsetHeight > window.innerHeight * 0.5 || - style.backgroundColor.includes('rgba') || - parseFloat(style.opacity) < 1 - ) - ) { - elem.remove(); - } - } - - // Remove elements matching common selectors - for (const selector of commonSelectors) { - const elements = document.querySelectorAll(selector); - elements.forEach(elem => { - if (isVisible(elem)) { - elem.remove(); - } - }); - } - }; - - // Remove overlay elements - removeOverlays(); - - // Remove any fixed/sticky position elements at the top/bottom - const removeFixedElements = () => { - const elements = document.querySelectorAll('*'); - elements.forEach(elem => { - const style = window.getComputedStyle(elem); - if ( - (style.position === 'fixed' || style.position === 'sticky') && - isVisible(elem) - ) { - elem.remove(); - } - }); - }; - - removeFixedElements(); - - // Remove empty block elements as: div, p, span, etc. 
- const removeEmptyBlockElements = () => { - const blockElements = document.querySelectorAll('div, p, span, section, article, header, footer, aside, nav, main, ul, ol, li, dl, dt, dd, h1, h2, h3, h4, h5, h6'); - blockElements.forEach(elem => { - if (elem.innerText.trim() === '') { - elem.remove(); - } - }); - }; - - // Remove margin-right and padding-right from body (often added by modal scripts) - document.body.style.marginRight = '0px'; - document.body.style.paddingRight = '0px'; - document.body.style.overflow = 'auto'; - - // Wait a bit for any animations to complete - await new Promise(resolve => setTimeout(resolve, 100)); - } - """ - - try: - await page.evaluate(remove_overlays_js) - await page.wait_for_timeout(500) # Wait for any animations to complete - except Exception as e: - self.logger.warning( - message="Failed to remove overlay elements: {error}", - tag="SCRAPE", - params={"error": str(e)} - ) - # if self.verbose: - # print(f"Warning: Failed to remove overlay elements: {str(e)}") - - async def take_screenshot(self, page: Page) -> str: - """ - Takes a screenshot of the current page. 
- - Args: - page (Page): The Playwright page instance - - Returns: - str: Base64-encoded screenshot image - """ - try: - # The page is already loaded, just take the screenshot - screenshot = await page.screenshot(full_page=True) - return base64.b64encode(screenshot).decode('utf-8') - except Exception as e: - error_message = f"Failed to take screenshot: {str(e)}" - self.logger.error( - message="Screenshot failed: {error}", - tag="ERROR", - params={"error": error_message} - ) - - - # Generate an error image - img = Image.new('RGB', (800, 600), color='black') - draw = ImageDraw.Draw(img) - font = ImageFont.load_default() - draw.text((10, 10), error_message, fill=(255, 255, 255), font=font) - - buffered = BytesIO() - img.save(buffered, format="JPEG") - return base64.b64encode(buffered.getvalue()).decode('utf-8') - finally: - await page.close() - - async def export_storage_state(self, path: str = None) -> dict: - """ - Exports the current storage state (cookies, localStorage, sessionStorage) - to a JSON file at the specified path. - """ - if self.default_context: - state = await self.default_context.storage_state(path=path) - self.logger.info( - message="Exported storage state to {path}", - tag="INFO", - params={"path": path} - ) - return state - else: - self.logger.warning( - message="No default_context available to export storage state.", - tag="WARNING" - ) - - async def _generate_screenshot_from_html(self, html: str) -> Optional[str]: - """ - Generates a screenshot from raw HTML content. - - Args: - html (str): The HTML content to render and capture. - - Returns: - Optional[str]: Base64-encoded screenshot image or an error image if failed. 
- """ - try: - if not self.browser: - await self.start() - page = await self.browser.new_page() - await page.set_content(html, wait_until='networkidle') - screenshot = await page.screenshot(full_page=True) - await page.close() - return base64.b64encode(screenshot).decode('utf-8') - except Exception as e: - error_message = f"Failed to take screenshot: {str(e)}" - # print(error_message) - self.logger.error( - message="Screenshot failed: {error}", - tag="ERROR", - params={"error": error_message} - ) - - # Generate an error image - img = Image.new('RGB', (800, 600), color='black') - draw = ImageDraw.Draw(img) - font = ImageFont.load_default() - draw.text((10, 10), error_message, fill=(255, 255, 255), font=font) - - buffered = BytesIO() - img.save(buffered, format="JPEG") - return base64.b64encode(buffered.getvalue()).decode('utf-8') - diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 553e9df4..3f040e13 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -17,9 +17,10 @@ import json import uuid from .js_snippet import load_js_script from .models import AsyncCrawlResponse -from .utils import create_box_message +from .utils import get_error_context from .user_agent_generator import UserAgentGenerator -from .config import SCREENSHOT_HEIGHT_TRESHOLD +from .config import SCREENSHOT_HEIGHT_TRESHOLD, DOWNLOAD_PAGE_TIMEOUT +from .async_configs import BrowserConfig, CrawlerRunConfig from playwright_stealth import StealthConfig, stealth_async @@ -64,7 +65,6 @@ BROWSER_DISABLE_OPTIONS = [ "--use-mock-keychain" ] - class ManagedBrowser: def __init__(self, browser_type: str = "chromium", user_data_dir: Optional[str] = None, headless: bool = False, logger = None, host: str = "localhost", debugging_port: int = 9222): self.browser_type = browser_type @@ -225,50 +225,44 @@ class ManagedBrowser: params={"error": str(e)} ) - class BrowserManager: - def __init__(self, use_managed_browser: bool, user_data_dir: 
Optional[str], headless: bool, logger, browser_type: str, proxy, proxy_config, chrome_channel: str, viewport_width: int, viewport_height: int, accept_downloads: bool, storage_state, ignore_https_errors: bool, java_script_enabled: bool, cookies: List[dict], headers: dict, extra_args: List[str], text_only: bool, light_mode: bool, user_agent: str, browser_hint: str, downloads_path: Optional[str]): - self.use_managed_browser = use_managed_browser - self.user_data_dir = user_data_dir - self.headless = headless + def __init__(self, browser_config: BrowserConfig, logger=None): + """ + Initialize the BrowserManager with a browser configuration. + + Args: + browser_config (BrowserConfig): Configuration object containing all browser settings + logger: Logger instance for recording events and errors + """ + self.config = browser_config self.logger = logger - self.browser_type = browser_type - self.proxy = proxy - self.proxy_config = proxy_config - self.chrome_channel = chrome_channel - self.viewport_width = viewport_width - self.viewport_height = viewport_height - self.accept_downloads = accept_downloads - self.storage_state = storage_state - self.ignore_https_errors = ignore_https_errors - self.java_script_enabled = java_script_enabled - self.cookies = cookies or [] - self.headers = headers or {} - self.extra_args = extra_args or [] - self.text_only = text_only - self.light_mode = light_mode + + # Browser state self.browser = None - self.default_context : BrowserContext = None + self.default_context = None self.managed_browser = None - self.sessions = {} - self.session_ttl = 1800 self.playwright = None - self.user_agent = user_agent - self.browser_hint = browser_hint - self.downloads_path = downloads_path + + # Session management + self.sessions = {} + self.session_ttl = 1800 # 30 minutes + + # Initialize ManagedBrowser if needed + if self.config.use_managed_browser: + self.managed_browser = ManagedBrowser( + browser_type=self.config.browser_type, + 
user_data_dir=self.config.user_data_dir, + headless=self.config.headless, + logger=self.logger + ) async def start(self): + """Start the browser instance and set up the default context.""" if self.playwright is None: from playwright.async_api import async_playwright self.playwright = await async_playwright().start() - if self.use_managed_browser: - self.managed_browser = ManagedBrowser( - browser_type=self.browser_type, - user_data_dir=self.user_data_dir, - headless=self.headless, - logger=self.logger - ) + if self.config.use_managed_browser: cdp_url = await self.managed_browser.start() self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url) contexts = self.browser.contexts @@ -276,103 +270,159 @@ class BrowserManager: self.default_context = contexts[0] else: self.default_context = await self.browser.new_context( - viewport={"width": self.viewport_width, "height": self.viewport_height}, - storage_state=self.storage_state, - user_agent=self.headers.get("User-Agent"), - accept_downloads=self.accept_downloads, - ignore_https_errors=self.ignore_https_errors, - java_script_enabled=self.java_script_enabled + viewport={"width": self.config.viewport_width, "height": self.config.viewport_height}, + storage_state=self.config.storage_state, + user_agent=self.config.headers.get("User-Agent", self.config.user_agent), + accept_downloads=self.config.accept_downloads, + ignore_https_errors=self.config.ignore_https_errors, + java_script_enabled=self.config.java_script_enabled ) await self.setup_context(self.default_context) else: - browser_args = { - "headless": self.headless, - "args": [ - "--no-sandbox", - "--disable-dev-shm-usage", - "--no-first-run", - "--no-default-browser-check", - "--disable-infobars", - "--window-position=0,0", - "--ignore-certificate-errors", - "--ignore-certificate-errors-spki-list", - "--disable-blink-features=AutomationControlled", - "--window-position=400,0", - f"--window-size={self.viewport_width},{self.viewport_height}", - ] - } - - 
if self.light_mode: - browser_args["args"].extend(BROWSER_DISABLE_OPTIONS) - - if self.text_only: - browser_args["args"].extend(['--blink-settings=imagesEnabled=false','--disable-remote-fonts']) - - if self.chrome_channel: - browser_args["channel"] = self.chrome_channel - - if self.extra_args: - browser_args["args"].extend(self.extra_args) - - if self.accept_downloads: - browser_args["downloads_path"] = os.path.join(os.getcwd(), "downloads") - os.makedirs(browser_args["downloads_path"], exist_ok=True) - - if self.proxy: - from playwright.async_api import ProxySettings - proxy_settings = ProxySettings(server=self.proxy) - browser_args["proxy"] = proxy_settings - elif self.proxy_config: - from playwright.async_api import ProxySettings - proxy_settings = ProxySettings( - server=self.proxy_config.get("server"), - username=self.proxy_config.get("username"), - password=self.proxy_config.get("password") - ) - browser_args["proxy"] = proxy_settings - - if self.browser_type == "firefox": + browser_args = self._build_browser_args() + + # Launch appropriate browser type + if self.config.browser_type == "firefox": self.browser = await self.playwright.firefox.launch(**browser_args) - elif self.browser_type == "webkit": + elif self.config.browser_type == "webkit": self.browser = await self.playwright.webkit.launch(**browser_args) else: self.browser = await self.playwright.chromium.launch(**browser_args) self.default_context = self.browser - # Since default_context in non-managed mode is the browser, no setup needed here. 
+ def _build_browser_args(self) -> dict: + """Build browser launch arguments from config.""" + args = [ + "--no-sandbox", + "--disable-dev-shm-usage", + "--no-first-run", + "--no-default-browser-check", + "--disable-infobars", + "--window-position=0,0", + "--ignore-certificate-errors", + "--ignore-certificate-errors-spki-list", + "--disable-blink-features=AutomationControlled", + "--window-position=400,0", + f"--window-size={self.config.viewport_width},{self.config.viewport_height}", + ] - async def setup_context(self, context : BrowserContext, is_default=False): - # Set extra headers - if self.headers: - await context.set_extra_http_headers(self.headers) + if self.config.light_mode: + args.extend(BROWSER_DISABLE_OPTIONS) - # Add cookies if any - if self.cookies: - await context.add_cookies(self.cookies) + if self.config.text_only: + args.extend(['--blink-settings=imagesEnabled=false', '--disable-remote-fonts']) - # Ensure storage_state if provided - if self.storage_state: - # If storage_state is a dictionary or file path, Playwright will handle it. 
+ if self.config.extra_args: + args.extend(self.config.extra_args) + + browser_args = { + "headless": self.config.headless, + "args": args + } + + if self.config.chrome_channel: + browser_args["channel"] = self.config.chrome_channel + + if self.config.accept_downloads: + browser_args["downloads_path"] = (self.config.downloads_path or + os.path.join(os.getcwd(), "downloads")) + os.makedirs(browser_args["downloads_path"], exist_ok=True) + + if self.config.proxy or self.config.proxy_config: + from playwright.async_api import ProxySettings + proxy_settings = ( + ProxySettings(server=self.config.proxy) if self.config.proxy else + ProxySettings( + server=self.config.proxy_config.get("server"), + username=self.config.proxy_config.get("username"), + password=self.config.proxy_config.get("password") + ) + ) + browser_args["proxy"] = proxy_settings + + return browser_args + + async def setup_context(self, context: BrowserContext, is_default=False): + """Set up a browser context with the configured options.""" + if self.config.headers: + await context.set_extra_http_headers(self.config.headers) + + if self.config.cookies: + await context.add_cookies(self.config.cookies) + + if self.config.storage_state: await context.storage_state(path=None) - # If accept_downloads, set timeouts and ensure properties - if self.accept_downloads: - await context.set_default_timeout(60000) - await context.set_default_navigation_timeout(60000) - if self.downloads_path: + if self.config.accept_downloads: + context.set_default_timeout(DOWNLOAD_PAGE_TIMEOUT) + context.set_default_navigation_timeout(DOWNLOAD_PAGE_TIMEOUT) + if self.config.downloads_path: context._impl_obj._options["accept_downloads"] = True - context._impl_obj._options["downloads_path"] = self.downloads_path + context._impl_obj._options["downloads_path"] = self.config.downloads_path - # If we have a user_agent, override it along with sec-ch-ua - if self.user_agent: - # Merge headers if needed - combined_headers = {"User-Agent": 
self.user_agent, "sec-ch-ua": self.browser_hint} - combined_headers.update(self.headers) + # Handle user agent and browser hints + if self.config.user_agent: + combined_headers = { + "User-Agent": self.config.user_agent, + "sec-ch-ua": self.config.browser_hint + } + combined_headers.update(self.config.headers) await context.set_extra_http_headers(combined_headers) - + + async def get_page(self, session_id: Optional[str], user_agent: str): + """Get a page for the given session ID, creating a new one if needed.""" + self._cleanup_expired_sessions() + + if session_id and session_id in self.sessions: + context, page, _ = self.sessions[session_id] + self.sessions[session_id] = (context, page, time.time()) + return page, context + + if self.config.use_managed_browser: + context = self.default_context + page = await context.new_page() + else: + context = await self.browser.new_context( + user_agent=user_agent, + viewport={"width": self.config.viewport_width, "height": self.config.viewport_height}, + proxy={"server": self.config.proxy} if self.config.proxy else None, + accept_downloads=self.config.accept_downloads, + storage_state=self.config.storage_state, + ignore_https_errors=self.config.ignore_https_errors + ) + await self.setup_context(context) + page = await context.new_page() + + if session_id: + self.sessions[session_id] = (context, page, time.time()) + + return page, context + + async def kill_session(self, session_id: str): + """Kill a browser session and clean up resources.""" + if session_id in self.sessions: + context, page, _ = self.sessions[session_id] + await page.close() + if not self.config.use_managed_browser: + await context.close() + del self.sessions[session_id] + + def _cleanup_expired_sessions(self): + """Clean up expired sessions based on TTL.""" + current_time = time.time() + expired_sessions = [ + sid for sid, (_, _, last_used) in self.sessions.items() + if current_time - last_used > self.session_ttl + ] + for sid in expired_sessions: + 
asyncio.create_task(self.kill_session(sid)) + async def close(self): - # Close all active sessions + """Close all browser resources and clean up.""" + if self.config.sleep_on_close: + await asyncio.sleep(0.5) + session_ids = list(self.sessions.keys()) for session_id in session_ids: await self.kill_session(session_id) @@ -390,54 +440,6 @@ class BrowserManager: await self.playwright.stop() self.playwright = None - async def get_page(self, session_id: Optional[str], user_agent: str): - # Cleanup expired sessions - self._cleanup_expired_sessions() - - if session_id: - context, page, _ = self.sessions.get(session_id, (None, None, None)) - if context and page: - self.sessions[session_id] = (context, page, time.time()) - return page, context - - # Create a new context/page pair - if self.use_managed_browser: - context = self.default_context - page = await context.new_page() - else: - context = await self.browser.new_context( - user_agent=user_agent, - viewport={"width": self.viewport_width, "height": self.viewport_height}, - proxy={"server": self.proxy} if self.proxy else None, - accept_downloads=self.accept_downloads, - storage_state=self.storage_state, - ignore_https_errors=self.ignore_https_errors - ) - await self.setup_context(context) - page = await context.new_page() - - if session_id: - self.sessions[session_id] = (context, page, time.time()) - - return page, context - - async def kill_session(self, session_id: str): - if session_id in self.sessions: - context, page, _ = self.sessions[session_id] - await page.close() - if not self.use_managed_browser: - await context.close() - del self.sessions[session_id] - - def _cleanup_expired_sessions(self): - current_time = time.time() - expired_sessions = [ - sid for sid, (_, _, last_used) in self.sessions.items() - if current_time - last_used > self.session_ttl - ] - for sid in expired_sessions: - asyncio.create_task(self.kill_session(sid)) - class AsyncCrawlerStrategy(ABC): @abstractmethod async def crawl(self, url: str, 
**kwargs) -> AsyncCrawlResponse: @@ -460,60 +462,24 @@ class AsyncCrawlerStrategy(ABC): pass class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): - def __init__(self, use_cached_html=False, js_code=None, logger = None, **kwargs): - self.text_only = kwargs.get("text_only", False) - self.light_mode = kwargs.get("light_mode", False) + def __init__(self, browser_config: BrowserConfig = None, logger = None, **kwargs): + """ + Initialize the AsyncPlaywrightCrawlerStrategy with a browser configuration. + + Args: + browser_config (BrowserConfig): Configuration object containing browser settings. + If None, will be created from kwargs for backwards compatibility. + logger: Logger instance for recording events and errors. + **kwargs: Additional arguments for backwards compatibility and extending functionality. + """ + # Initialize browser config, either from provided object or kwargs + self.browser_config = browser_config or BrowserConfig.from_kwargs(kwargs) self.logger = logger - self.use_cached_html = use_cached_html - self.viewport_width = kwargs.get("viewport_width", 800 if self.text_only else 1920) - self.viewport_height = kwargs.get("viewport_height", 600 if self.text_only else 1080) - if self.text_only: - self.extra_args = kwargs.get("extra_args", []) + [ - '--disable-images', - '--disable-javascript', - '--disable-gpu', - '--disable-software-rasterizer', - '--disable-dev-shm-usage' - ] - - self.user_agent = kwargs.get( - "user_agent", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.5845.187 Safari/604.1 Edg/117.0.2045.47" - # "Mozilla/5.0 (Linux; Android 11; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.120 Mobile Safari/537.36" - ) - user_agenr_generator = UserAgentGenerator() - if kwargs.get("user_agent_mode") == "random": - self.user_agent = user_agenr_generator.generate( - **kwargs.get("user_agent_generator_config", {}) - ) - self.pdf = kwargs.get("pdf", False) # New flag - 
self.screenshot_requested = kwargs.get('screenshot', False) + # Initialize session management + self._downloaded_files = [] - self.proxy = kwargs.get("proxy") - self.proxy_config = kwargs.get("proxy_config") - self.headless = kwargs.get("headless", True) - self.browser_type = kwargs.get("browser_type", "chromium") - self.headers = kwargs.get("headers", {}) - self.browser_hint = user_agenr_generator.generate_client_hints(self.user_agent) - self.headers.setdefault("sec-ch-ua", self.browser_hint) - self.cookies = kwargs.get("cookies", []) - self.storage_state = kwargs.get("storage_state", None) - self.sessions = {} - self.session_ttl = 1800 - self.js_code = js_code - self.verbose = kwargs.get("verbose", False) - self.playwright = None - self.browser = None - self.sleep_on_close = kwargs.get("sleep_on_close", False) - self.use_managed_browser = kwargs.get("use_managed_browser", False) - self.user_data_dir = kwargs.get("user_data_dir", None) - self.use_persistent_context = kwargs.get("use_persistent_context", False) - if self.use_persistent_context: - self.use_managed_browser = True - self.chrome_channel = kwargs.get("chrome_channel", "chrome") - self.managed_browser = None - self.default_context = None + # Initialize hooks system self.hooks = { 'on_browser_created': None, 'on_user_agent_updated': None, @@ -523,40 +489,12 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): 'before_return_html': None, 'before_retrieve_html': None } - self.extra_args = kwargs.get("extra_args", []) - self.ignore_https_errors = kwargs.get("ignore_https_errors", True) - self.java_script_enabled = kwargs.get("java_script_enabled", True) - self.accept_downloads = kwargs.get("accept_downloads", False) - self.downloads_path = kwargs.get("downloads_path") - self._downloaded_files = [] # Track downloaded files for current crawl - if self.accept_downloads and not self.downloads_path: - self.downloads_path = os.path.join(os.getcwd(), "downloads") - os.makedirs(self.downloads_path, 
exist_ok=True) - + + # Initialize browser manager with config self.browser_manager = BrowserManager( - use_managed_browser=self.use_managed_browser, - user_data_dir=self.user_data_dir, - headless=self.headless, - logger=self.logger, - browser_type=self.browser_type, - proxy=self.proxy, - proxy_config=self.proxy_config, - chrome_channel=self.chrome_channel, - viewport_width=self.viewport_width, - viewport_height=self.viewport_height, - accept_downloads=self.accept_downloads, - storage_state=self.storage_state, - ignore_https_errors=self.ignore_https_errors, - java_script_enabled=self.java_script_enabled, - cookies=self.cookies, - headers=self.headers, - extra_args=self.extra_args, - text_only=self.text_only, - light_mode=self.light_mode, - user_agent=self.user_agent, - browser_hint=self.browser_hint, - downloads_path=self.downloads_path - ) + browser_config=self.browser_config, + logger=self.logger + ) async def __aenter__(self): await self.start() @@ -570,15 +508,15 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): await self.execute_hook('on_browser_created', self.browser_manager.browser, context = self.browser_manager.default_context) async def close(self): - if self.sleep_on_close: - await asyncio.sleep(0.5) - await self.browser_manager.close() - - # Issue #256: Remove __del__ method to avoid potential issues with async cleanup - # def __del__(self): - # if self.browser or self.playwright: - # asyncio.get_event_loop().run_until_complete(self.close()) + + async def kill_session(self, session_id: str): + # Log a warning message and no need kill session, in new version auto kill session + self.logger.warning( + message="Session auto-kill is enabled in the new version. 
No need to manually kill sessions.", + tag="WARNING" + ) + await self.browser_manager.kill_session(session_id) def set_hook(self, hook_type: str, hook: Callable): if hook_type in self.hooks: @@ -600,23 +538,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): def set_custom_headers(self, headers: Dict[str, str]): self.headers = headers - - async def kill_session(self, session_id: str): - if session_id in self.sessions: - context, page, _ = self.sessions[session_id] - await page.close() - if not self.use_managed_browser: - await context.close() - del self.sessions[session_id] - - def _cleanup_expired_sessions(self): - current_time = time.time() - expired_sessions = [ - sid for sid, (_, _, last_used) in self.sessions.items() - if current_time - last_used > self.session_ttl - ] - for sid in expired_sessions: - asyncio.create_task(self.kill_session(sid)) async def smart_wait(self, page: Page, wait_for: str, timeout: float = 30000): wait_for = wait_for.strip() @@ -715,7 +636,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): }} """) else: - # print(f"Warning: Could not access content frame for iframe {i}") self.logger.warning( message="Could not access content frame for iframe {index}", tag="SCRAPE", @@ -727,7 +647,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): tag="ERROR", params={"index": i, "error": str(e)} ) - # print(f"Error processing iframe {i}: {str(e)}") # Return the page object return page @@ -743,7 +662,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): page, context = await self.browser_manager.get_page(session_id, user_agent) return session_id - async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse: + async def crawl(self, url: str, config: CrawlerRunConfig, **kwargs) -> AsyncCrawlResponse: """ Crawls a given URL or processes raw HTML/local file content based on the URL prefix. 
@@ -759,15 +678,13 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): Returns: AsyncCrawlResponse: The response containing HTML, headers, status code, and optional screenshot. """ + config = config or CrawlerRunConfig.from_kwargs(kwargs) response_headers = {} - status_code = 200 # Default to 200 for local/raw HTML - screenshot_requested = kwargs.get("screenshot", self.screenshot_requested) - pdf_requested = kwargs.get("pdf", self.pdf) + status_code = 200 # Default for local/raw HTML screenshot_data = None if url.startswith(('http://', 'https://')): - # Proceed with standard web crawling - return await self._crawl_web(url, **kwargs) + return await self._crawl_web(url, config) elif url.startswith('file://'): # Process local file @@ -776,7 +693,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): raise FileNotFoundError(f"Local file not found: {local_file_path}") with open(local_file_path, 'r', encoding='utf-8') as f: html = f.read() - if screenshot_requested: + if config.screenshot: screenshot_data = await self._generate_screenshot_from_html(html) return AsyncCrawlResponse( html=html, @@ -790,7 +707,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # Process raw HTML content raw_html = url[4:] # Remove 'raw:' prefix html = raw_html - if screenshot_requested: + if config.screenshot: screenshot_data = await self._generate_screenshot_from_html(html) return AsyncCrawlResponse( html=html, @@ -802,92 +719,85 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): else: raise ValueError("URL must start with 'http://', 'https://', 'file://', or 'raw:'") - async def _crawl_web(self, url: str, **kwargs) -> AsyncCrawlResponse: + async def _crawl_web(self, url: str, config: CrawlerRunConfig) -> AsyncCrawlResponse: + """ + Internal method to crawl web URLs with the specified configuration. 
+ + Args: + url (str): The web URL to crawl + config (CrawlerRunConfig): Configuration object controlling the crawl behavior + + Returns: + AsyncCrawlResponse: The response containing HTML, headers, status code, and optional data + """ response_headers = {} status_code = None - screenshot_requested = kwargs.get("screenshot", self.screenshot_requested) - pdf_requested = kwargs.get("pdf", self.pdf) - # Reset downloaded files list for new crawl self._downloaded_files = [] - self._cleanup_expired_sessions() - session_id = kwargs.get("session_id") - - # Check if in kwargs we have user_agent that will override the default user_agent - user_agent = kwargs.get("user_agent", self.user_agent) - - # Generate random user agent if magic mode is enabled and user_agent_mode is not random - if kwargs.get("user_agent_mode") != "random" and kwargs.get("magic", False): + # Handle user agent with magic mode + user_agent = self.browser_config.user_agent + if config.magic and self.browser_config.user_agent_mode != "random": user_agent = UserAgentGenerator().generate( - **kwargs.get("user_agent_generator_config", {}) + **(self.browser_config.user_agent_generator_config or {}) ) - # Handle page creation differently for managed browser - page, context = await self.browser_manager.get_page(session_id, user_agent) + # Get page for session + page, context = await self.browser_manager.get_page( + session_id=config.session_id, + user_agent=user_agent + ) + + # Add default cookie await context.add_cookies([{"name": "cookiesEnabled", "value": "true", "url": url}]) - if kwargs.get("override_navigator", False) or kwargs.get("simulate_user", False) or kwargs.get("magic", False): - # Inject scripts to override navigator properties + # Handle navigator overrides + if config.override_navigator or config.simulate_user or config.magic: await context.add_init_script(load_js_script("navigator_overrider")) - # Add console message and error logging - if kwargs.get("log_console", False): - page.on("console", 
lambda msg: print(f"Console: {msg.text}")) - page.on("pageerror", lambda exc: print(f"Page Error: {exc}")) + # Set up console logging if requested + if config.log_console: + page.on("console", lambda msg: self.logger.debug( + message="Console: {msg}", + tag="CONSOLE", + params={"msg": msg.text} + )) + page.on("pageerror", lambda exc: self.logger.error( + message="Page error: {exc}", + tag="ERROR", + params={"exc": exc} + )) try: - # Set up download handling if enabled - if self.accept_downloads: + # Set up download handling + if self.browser_config.accept_downloads: page.on("download", lambda download: asyncio.create_task(self._handle_download(download))) - if self.use_cached_html: - cache_file_path = os.path.join( - os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest() - ) - if os.path.exists(cache_file_path): - html = "" - with open(cache_file_path, "r") as f: - html = f.read() - # retrieve response headers and status code from cache - with open(cache_file_path + ".meta", "r") as f: - meta = json.load(f) - response_headers = meta.get("response_headers", {}) - status_code = meta.get("status_code") - response = AsyncCrawlResponse( - html=html, response_headers=response_headers, status_code=status_code - ) - return response - - if not kwargs.get("js_only", False): - await self.execute_hook('before_goto', page, context = context, **kwargs) + # Handle page navigation and content loading + if not config.js_only: + await self.execute_hook('before_goto', page, context=context) try: response = await page.goto( url, - # wait_until=kwargs.get("wait_until", ["domcontentloaded", "networkidle"]), - wait_until=kwargs.get("wait_until", "domcontentloaded"), - timeout=kwargs.get("page_timeout", 60000), + wait_until=config.wait_until, + timeout=config.page_timeout ) except Error as e: - raise RuntimeError(f"Failed on navigating ACS-GOTO :\n{str(e)}") + raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}") - await 
self.execute_hook('after_goto', page, context = context, **kwargs) + await self.execute_hook('after_goto', page, context=context) - # Get status code and headers status_code = response.status response_headers = response.headers else: status_code = 200 response_headers = {} - # Replace the current wait_for_selector line with this more robust check: + # Wait for body element and visibility try: - # First wait for body to exist, regardless of visibility await page.wait_for_selector('body', state='attached', timeout=30000) - - # Then wait for it to become visible by checking CSS await page.wait_for_function(""" () => { const body = document.body; @@ -897,9 +807,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): style.opacity !== '0'; } """, timeout=30000) - except Error as e: - # If waiting fails, let's try to diagnose the issue visibility_info = await page.evaluate(""" () => { const body = document.body; @@ -914,233 +822,195 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): } """) - if self.verbose: - print(f"Body visibility debug info: {visibility_info}") + if self.config.verbose: + self.logger.debug( + message="Body visibility info: {info}", + tag="DEBUG", + params={"info": visibility_info} + ) - # Even if body is hidden, we might still want to proceed - if kwargs.get('ignore_body_visibility', True): - if self.verbose: - print("Proceeding despite hidden body...") - pass - else: + if not config.ignore_body_visibility: raise Error(f"Body element is hidden: {visibility_info}") - - # CONTENT LOADING ASSURANCE - if not self.text_only and (kwargs.get("wait_for_images", True) or kwargs.get("adjust_viewport_to_content", False)): - # Wait for network idle after initial load and images to load - # await page.wait_for_load_state("networkidle") + + # Handle content loading and viewport adjustment + if not self.browser_config.text_only and (config.wait_for_images or config.adjust_viewport_to_content): await page.wait_for_load_state("domcontentloaded") 
await asyncio.sleep(0.1) - from playwright.async_api import TimeoutError as PlaywrightTimeoutError try: - await page.wait_for_function("Array.from(document.images).every(img => img.complete)", timeout=1000) - # Check for TimeoutError and ignore it + await page.wait_for_function( + "Array.from(document.images).every(img => img.complete)", + timeout=1000 + ) except PlaywrightTimeoutError: pass - - # After initial load, adjust viewport to content size - if not self.text_only and kwargs.get("adjust_viewport_to_content", False): - try: - # Get actual page dimensions + + # Adjust viewport if needed + if not self.browser_config.text_only and config.adjust_viewport_to_content: + try: page_width = await page.evaluate("document.documentElement.scrollWidth") page_height = await page.evaluate("document.documentElement.scrollHeight") - target_width = self.viewport_width + target_width = self.browser_config.viewport_width target_height = int(target_width * page_width / page_height * 0.95) await page.set_viewport_size({"width": target_width, "height": target_height}) - # Compute scale factor - # We want the entire page visible: the scale should make both width and height fit scale = min(target_width / page_width, target_height / page_height) - - # Now we call CDP to set metrics. - # We tell Chrome that the "device" is page_width x page_height in size, - # but we scale it down so everything fits within the real viewport. 
cdp = await page.context.new_cdp_session(page) await cdp.send('Emulation.setDeviceMetricsOverride', { - 'width': page_width, # full page width - 'height': page_height, # full page height - 'deviceScaleFactor': 1, # keep normal DPR + 'width': page_width, + 'height': page_height, + 'deviceScaleFactor': 1, 'mobile': False, - 'scale': scale # scale the entire rendered content + 'scale': scale }) - except Exception as e: self.logger.warning( message="Failed to adjust viewport to content: {error}", tag="VIEWPORT", params={"error": str(e)} - ) - - # After viewport adjustment, handle page scanning if requested - if kwargs.get("scan_full_page", False): - try: - viewport_height = page.viewport_size.get("height", self.viewport_height) - current_position = viewport_height # Start with one viewport height - scroll_delay = kwargs.get("scroll_delay", 0.2) - - # Initial scroll - await page.evaluate(f"window.scrollTo(0, {current_position})") - await asyncio.sleep(scroll_delay) - - # Get height after first scroll to account for any dynamic content - total_height = await page.evaluate("document.documentElement.scrollHeight") - - while current_position < total_height: - current_position = min(current_position + viewport_height, total_height) - await page.evaluate(f"window.scrollTo(0, {current_position})") - await asyncio.sleep(scroll_delay) - - # Check for dynamic content - new_height = await page.evaluate("document.documentElement.scrollHeight") - if new_height > total_height: - total_height = new_height - - # Scroll back to top - await page.evaluate("window.scrollTo(0, 0)") - - except Exception as e: - self.logger.warning( - message="Failed to perform full page scan: {error}", - tag="PAGE_SCAN", - params={"error": str(e)} ) - else: - # Scroll to the bottom of the page - await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") - js_code = kwargs.get("js_code", kwargs.get("js", self.js_code)) - if js_code: - if isinstance(js_code, str): - await page.evaluate(js_code) - 
elif isinstance(js_code, list): - for js in js_code: + # Handle full page scanning + if config.scan_full_page: + await self._handle_full_page_scan(page, config.scroll_delay) + + # Execute JavaScript if provided + if config.js_code: + if isinstance(config.js_code, str): + await page.evaluate(config.js_code) + elif isinstance(config.js_code, list): + for js in config.js_code: await page.evaluate(js) - # await page.wait_for_timeout(100) - - # Check for on execution event - await self.execute_hook('on_execution_started', page, context = context, **kwargs) - - if kwargs.get("simulate_user", False) or kwargs.get("magic", False): - # Simulate user interactions + await self.execute_hook('on_execution_started', page, context=context) + + # Handle user simulation + if config.simulate_user or config.magic: await page.mouse.move(100, 100) await page.mouse.down() await page.mouse.up() await page.keyboard.press('ArrowDown') - # Handle the wait_for parameter - wait_for = kwargs.get("wait_for") - if wait_for: + # Handle wait_for condition + if config.wait_for: try: - await self.smart_wait(page, wait_for, timeout=kwargs.get("page_timeout", 60000)) + await self.smart_wait(page, config.wait_for, timeout=config.page_timeout) except Exception as e: raise RuntimeError(f"Wait condition failed: {str(e)}") - - # if not wait_for and js_code: - # await page.wait_for_load_state('networkidle', timeout=5000) - # Update image dimensions - if not self.text_only: + # Update image dimensions if needed + if not self.browser_config.text_only: update_image_dimensions_js = load_js_script("update_image_dimensions") - try: try: - await page.wait_for_load_state( - # state="load", - state="domcontentloaded", - timeout=5 - ) + await page.wait_for_load_state("domcontentloaded", timeout=5) except PlaywrightTimeoutError: pass await page.evaluate(update_image_dimensions_js) except Exception as e: self.logger.error( - message="Error updating image dimensions ACS-UPDATE_IMAGE_DIMENSIONS_JS: {error}", + 
message="Error updating image dimensions: {error}", tag="ERROR", params={"error": str(e)} ) - # raise RuntimeError(f"Error updating image dimensions ACS-UPDATE_IMAGE_DIMENSIONS_JS: {str(e)}") - # Wait a bit for any onload events to complete - # await page.wait_for_timeout(100) - - # Process iframes - if kwargs.get("process_iframes", False): + # Process iframes if needed + if config.process_iframes: page = await self.process_iframes(page) - - await self.execute_hook('before_retrieve_html', page, context = context, **kwargs) - # Check if delay_before_return_html is set then wait for that time - delay_before_return_html = kwargs.get("delay_before_return_html", 0.1) - if delay_before_return_html: - await asyncio.sleep(delay_before_return_html) - - # Check for remove_overlay_elements parameter - if kwargs.get("remove_overlay_elements", False): + + # Pre-content retrieval hooks and delay + await self.execute_hook('before_retrieve_html', page, context=context) + if config.delay_before_return_html: + await asyncio.sleep(config.delay_before_return_html) + + # Handle overlay removal + if config.remove_overlay_elements: await self.remove_overlay_elements(page) - + + # Get final HTML content html = await page.content() - await self.execute_hook('before_return_html', page, html, context = context, **kwargs) - + await self.execute_hook('before_return_html', page, html, context=context) + + # Handle PDF and screenshot generation start_export_time = time.perf_counter() pdf_data = None - if pdf_requested: - # Generate PDF once - pdf_data = await self.export_pdf(page) - - # Check if kwargs has screenshot=True then take screenshot screenshot_data = None - if screenshot_requested: #kwargs.get("screenshot"): - # Check we have screenshot_wait_for parameter, if we have simply wait for that time - screenshot_wait_for = kwargs.get("screenshot_wait_for") - if screenshot_wait_for: - await asyncio.sleep(screenshot_wait_for) - - screenshot_data = await self.take_screenshot(page, **kwargs) - 
end_export_time = time.perf_counter() + + if config.pdf: + pdf_data = await self.export_pdf(page) + + if config.screenshot: + if config.screenshot_wait_for: + await asyncio.sleep(config.screenshot_wait_for) + screenshot_data = await self.take_screenshot( + page, + screenshot_height_threshold=config.screenshot_height_threshold + ) + if screenshot_data or pdf_data: self.logger.info( message="Exporting PDF and taking screenshot took {duration:.2f}s", tag="EXPORT", - params={"duration": end_export_time - start_export_time} + params={"duration": time.perf_counter() - start_export_time} ) - - if self.use_cached_html: - cache_file_path = os.path.join( - os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest() - ) - with open(cache_file_path, "w", encoding="utf-8") as f: - f.write(html) - # store response headers and status code in cache - with open(cache_file_path + ".meta", "w", encoding="utf-8") as f: - json.dump({ - "response_headers": response_headers, - "status_code": status_code - }, f) + # Define delayed content getter async def get_delayed_content(delay: float = 5.0) -> str: - if self.verbose: - print(f"[LOG] Waiting for {delay} seconds before retrieving content for {url}") + if self.config.verbose: + self.logger.info( + message="Waiting for {delay} seconds before retrieving content for {url}", + tag="INFO", + params={"delay": delay, "url": url} + ) await asyncio.sleep(delay) return await page.content() - - response = AsyncCrawlResponse( - html=html, - response_headers=response_headers, + + # Return complete response + return AsyncCrawlResponse( + html=html, + response_headers=response_headers, status_code=status_code, screenshot=screenshot_data, pdf_data=pdf_data, get_delayed_content=get_delayed_content, downloaded_files=self._downloaded_files if self._downloaded_files else None ) - return response - except Error as e: - raise Error(f"async_crawler_strategy.py:_crawleb(): {str(e)}") - # finally: - # if not 
session_id: - # await page.close() - # await context.close() + except Exception as e: + raise e + + async def _handle_full_page_scan(self, page: Page, scroll_delay: float): + """Helper method to handle full page scanning""" + try: + viewport_height = page.viewport_size.get("height", self.browser_config.viewport_height) + current_position = viewport_height + + await page.evaluate(f"window.scrollTo(0, {current_position})") + await asyncio.sleep(scroll_delay) + + total_height = await page.evaluate("document.documentElement.scrollHeight") + + while current_position < total_height: + current_position = min(current_position + viewport_height, total_height) + await page.evaluate(f"window.scrollTo(0, {current_position})") + await asyncio.sleep(scroll_delay) + + new_height = await page.evaluate("document.documentElement.scrollHeight") + if new_height > total_height: + total_height = new_height + + await page.evaluate("window.scrollTo(0, 0)") + + except Exception as e: + self.logger.warning( + message="Failed to perform full page scan: {error}", + tag="PAGE_SCAN", + params={"error": str(e)} + ) + else: + await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") + + async def _handle_download(self, download): """Handle file downloads.""" try: @@ -1170,8 +1040,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): params={"error": str(e)} ) - # if self.verbose: - # print(f"[ERROR] Failed to handle download: {str(e)}") async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]: semaphore_count = kwargs.get('semaphore_count', 5) # Adjust as needed @@ -1192,7 +1060,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): Args: page (Page): The Playwright page instance """ - remove_overlays_js = load_js_script("remove_overlays") + remove_overlays_js = load_js_script("remove_overlay_elements") try: await page.evaluate(remove_overlays_js) @@ -1203,8 +1071,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): 
tag="SCRAPE", params={"error": str(e)} ) - # if self.verbose: - # print(f"Warning: Failed to remove overlay elements: {str(e)}") async def export_pdf(self, page: Page) -> bytes: """ @@ -1386,7 +1252,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): return base64.b64encode(screenshot).decode('utf-8') except Exception as e: error_message = f"Failed to take screenshot: {str(e)}" - # print(error_message) self.logger.error( message="Screenshot failed: {error}", tag="ERROR", diff --git a/crawl4ai/async_database.py b/crawl4ai/async_database.py index 3c97e7d1..5cdafac2 100644 --- a/crawl4ai/async_database.py +++ b/crawl4ai/async_database.py @@ -1,4 +1,4 @@ -import os +import os, sys from pathlib import Path import aiosqlite import asyncio @@ -13,6 +13,7 @@ import aiofiles from .config import NEED_MIGRATION from .version_manager import VersionManager from .async_logger import AsyncLogger +from .utils import get_error_context, create_box_message # Set up logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -97,35 +98,84 @@ class AsyncDatabaseManager: @asynccontextmanager async def get_connection(self): - """Connection pool manager""" + """Connection pool manager with enhanced error handling""" if not self._initialized: - # Use an asyncio.Lock to ensure only one initialization occurs async with self.init_lock: if not self._initialized: - await self.initialize() - self._initialized = True + try: + await self.initialize() + self._initialized = True + except Exception as e: + import sys + error_context = get_error_context(sys.exc_info()) + self.logger.error( + message="Database initialization failed:\n{error}\n\nContext:\n{context}\n\nTraceback:\n{traceback}", + tag="ERROR", + force_verbose=True, + params={ + "error": str(e), + "context": error_context["code_context"], + "traceback": error_context["full_traceback"] + } + ) + raise await self.connection_semaphore.acquire() task_id = id(asyncio.current_task()) + try: async with 
self.pool_lock: if task_id not in self.connection_pool: - conn = await aiosqlite.connect( - self.db_path, - timeout=30.0 - ) - await conn.execute('PRAGMA journal_mode = WAL') - await conn.execute('PRAGMA busy_timeout = 5000') - self.connection_pool[task_id] = conn + try: + conn = await aiosqlite.connect( + self.db_path, + timeout=30.0 + ) + await conn.execute('PRAGMA journal_mode = WAL') + await conn.execute('PRAGMA busy_timeout = 5000') + + # Verify database structure + async with conn.execute("PRAGMA table_info(crawled_data)") as cursor: + columns = await cursor.fetchall() + column_names = [col[1] for col in columns] + expected_columns = { + 'url', 'html', 'cleaned_html', 'markdown', 'extracted_content', + 'success', 'media', 'links', 'metadata', 'screenshot', + 'response_headers', 'downloaded_files' + } + missing_columns = expected_columns - set(column_names) + if missing_columns: + raise ValueError(f"Database missing columns: {missing_columns}") + + self.connection_pool[task_id] = conn + except Exception as e: + import sys + error_context = get_error_context(sys.exc_info()) + error_message = ( + f"Unexpected error in db get_connection at line {error_context['line_no']} " + f"in {error_context['function']} ({error_context['filename']}):\n" + f"Error: {str(e)}\n\n" + f"Code context:\n{error_context['code_context']}" + ) + self.logger.error( + message=create_box_message(error_message, type= "error"), + ) + + raise yield self.connection_pool[task_id] except Exception as e: + import sys + error_context = get_error_context(sys.exc_info()) + error_message = ( + f"Unexpected error in db get_connection at line {error_context['line_no']} " + f"in {error_context['function']} ({error_context['filename']}):\n" + f"Error: {str(e)}\n\n" + f"Code context:\n{error_context['code_context']}" + ) self.logger.error( - message="Connection error: {error}", - tag="ERROR", - force_verbose=True, - params={"error": str(e)} + message=create_box_message(error_message, type= "error"), ) 
raise finally: @@ -230,7 +280,8 @@ class AsyncDatabaseManager: 'cleaned_html': row_dict['cleaned_html'], 'markdown': row_dict['markdown'], 'extracted_content': row_dict['extracted_content'], - 'screenshot': row_dict['screenshot'] + 'screenshot': row_dict['screenshot'], + 'screenshots': row_dict['screenshot'], } for field, hash_value in content_fields.items(): diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index fc6fe82f..72ef0bf8 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -1,4 +1,4 @@ -import os +import os, sys import time import warnings from enum import Enum @@ -17,7 +17,7 @@ from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawler from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode from .content_scraping_strategy import WebScrapingStrategy from .async_logger import AsyncLogger - +from .async_configs import BrowserConfig, CrawlerRunConfig from .config import ( MIN_WORD_THRESHOLD, IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD, @@ -40,31 +40,20 @@ class AsyncWebCrawler: """ Asynchronous web crawler with flexible caching capabilities. - Migration Guide (from version X.X.X): + Migration Guide: Old way (deprecated): - crawler = AsyncWebCrawler(always_by_pass_cache=True) - result = await crawler.arun( - url="https://example.com", - bypass_cache=True, - no_cache_read=True, - no_cache_write=False - ) + crawler = AsyncWebCrawler(always_by_pass_cache=True, browser_type="chromium", headless=True) New way (recommended): - crawler = AsyncWebCrawler(always_bypass_cache=True) - result = await crawler.arun( - url="https://example.com", - cache_mode=CacheMode.WRITE_ONLY - ) - - To disable deprecation warnings: - Pass warning=False to suppress the warning. 
+ browser_config = BrowserConfig(browser_type="chromium", headless=True) + crawler = AsyncWebCrawler(browser_config=browser_config) """ _domain_last_hit = {} def __init__( self, crawler_strategy: Optional[AsyncCrawlerStrategy] = None, + config: Optional[BrowserConfig] = None, always_bypass_cache: bool = False, always_by_pass_cache: Optional[bool] = None, # Deprecated parameter base_directory: str = str(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())), @@ -75,28 +64,48 @@ class AsyncWebCrawler: Initialize the AsyncWebCrawler. Args: - crawler_strategy: Strategy for crawling web pages + crawler_strategy: Strategy for crawling web pages. If None, will create AsyncPlaywrightCrawlerStrategy + config: Configuration object for browser settings. If None, will be created from kwargs always_bypass_cache: Whether to always bypass cache (new parameter) always_by_pass_cache: Deprecated, use always_bypass_cache instead base_directory: Base directory for storing cache + thread_safe: Whether to use thread-safe operations + **kwargs: Additional arguments for backwards compatibility """ - self.verbose = kwargs.get("verbose", False) + # Handle browser configuration + browser_config = config + if browser_config is not None: + if any(k in kwargs for k in ["browser_type", "headless", "viewport_width", "viewport_height"]): + self.logger.warning( + message="Both browser_config and legacy browser parameters provided. 
browser_config will take precedence.", + tag="WARNING" + ) + else: + # Create browser config from kwargs for backwards compatibility + browser_config = BrowserConfig.from_kwargs(kwargs) + + self.browser_config = browser_config + + # Initialize logger first since other components may need it self.logger = AsyncLogger( log_file=os.path.join(base_directory, ".crawl4ai", "crawler.log"), - verbose=self.verbose, + verbose=self.browser_config.verbose, tag_width=10 ) + + # Initialize crawler strategy self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy( - logger = self.logger, - **kwargs + browser_config=browser_config, + logger=self.logger, + **kwargs # Pass remaining kwargs for backwards compatibility ) - # Handle deprecated parameter + # Handle deprecated cache parameter if always_by_pass_cache is not None: if kwargs.get("warning", True): warnings.warn( - "'always_by_pass_cache' is deprecated and will be removed in version X.X.X. " + "'always_by_pass_cache' is deprecated and will be removed in version 0.5.0. " "Use 'always_bypass_cache' instead. 
" "Pass warning=False to suppress this warning.", DeprecationWarning, @@ -106,13 +115,15 @@ class AsyncWebCrawler: else: self.always_bypass_cache = always_bypass_cache + # Thread safety setup self._lock = asyncio.Lock() if thread_safe else None + # Initialize directories self.crawl4ai_folder = os.path.join(base_directory, ".crawl4ai") os.makedirs(self.crawl4ai_folder, exist_ok=True) os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True) + self.ready = False - self.verbose = kwargs.get("verbose", False) async def __aenter__(self): await self.crawler_strategy.__aenter__() @@ -131,197 +142,198 @@ class AsyncWebCrawler: self.logger.info(f"Crawl4AI {crawl4ai_version}", tag="INIT") self.ready = True + async def arun( - self, - url: str, - word_count_threshold=MIN_WORD_THRESHOLD, - extraction_strategy: ExtractionStrategy = None, - chunking_strategy: ChunkingStrategy = RegexChunking(), - content_filter: RelevantContentFilter = None, - cache_mode: Optional[CacheMode] = None, - # Deprecated parameters - bypass_cache: bool = False, - disable_cache: bool = False, - no_cache_read: bool = False, - no_cache_write: bool = False, - # Other parameters - css_selector: str = None, - screenshot: bool = False, - pdf: bool = False, - user_agent: str = None, - verbose=True, - **kwargs, - ) -> CrawlResult: - """ - Runs the crawler for a single source: URL (web, local file, or raw HTML). 
+ self, + url: str, + config: Optional[CrawlerRunConfig] = None, + # Legacy parameters maintained for backwards compatibility + word_count_threshold=MIN_WORD_THRESHOLD, + extraction_strategy: ExtractionStrategy = None, + chunking_strategy: ChunkingStrategy = RegexChunking(), + content_filter: RelevantContentFilter = None, + cache_mode: Optional[CacheMode] = None, + # Deprecated cache parameters + bypass_cache: bool = False, + disable_cache: bool = False, + no_cache_read: bool = False, + no_cache_write: bool = False, + # Other legacy parameters + css_selector: str = None, + screenshot: bool = False, + pdf: bool = False, + user_agent: str = None, + verbose=True, + **kwargs, + ) -> CrawlResult: + """ + Runs the crawler for a single source: URL (web, local file, or raw HTML). - Migration from legacy cache parameters: + Migration Guide: Old way (deprecated): - await crawler.arun(url, bypass_cache=True, no_cache_read=True) + result = await crawler.arun( + url="https://example.com", + word_count_threshold=200, + screenshot=True, + ... + ) - New way: - await crawler.arun(url, cache_mode=CacheMode.BYPASS) + New way (recommended): + config = CrawlerRunConfig( + word_count_threshold=200, + screenshot=True, + ... 
+ ) + result = await crawler.arun(url="https://example.com", crawler_config=config) - Args: - url: The URL to crawl (http://, https://, file://, or raw:) - cache_mode: Cache behavior control (recommended) - word_count_threshold: Minimum word count threshold - extraction_strategy: Strategy for content extraction - chunking_strategy: Strategy for content chunking - css_selector: CSS selector for content extraction - screenshot: Whether to capture screenshot - user_agent: Custom user agent - verbose: Enable verbose logging + Args: + url: The URL to crawl (http://, https://, file://, or raw:) + crawler_config: Configuration object controlling crawl behavior + [other parameters maintained for backwards compatibility] - Deprecated Args: - bypass_cache: Use cache_mode=CacheMode.BYPASS instead - disable_cache: Use cache_mode=CacheMode.DISABLED instead - no_cache_read: Use cache_mode=CacheMode.WRITE_ONLY instead - no_cache_write: Use cache_mode=CacheMode.READ_ONLY instead + Returns: + CrawlResult: The result of crawling and processing + """ + crawler_config = config + if not isinstance(url, str) or not url: + raise ValueError("Invalid URL, make sure the URL is a non-empty string") + + async with self._lock or self.nullcontext(): + try: + # Handle configuration + if crawler_config is not None: + if any(param is not None for param in [ + word_count_threshold, extraction_strategy, chunking_strategy, + content_filter, cache_mode, css_selector, screenshot, pdf + ]): + self.logger.warning( + message="Both crawler_config and legacy parameters provided. 
crawler_config will take precedence.", + tag="WARNING" + ) + config = crawler_config + else: + # Merge all parameters into a single kwargs dict for config creation + config_kwargs = { + "word_count_threshold": word_count_threshold, + "extraction_strategy": extraction_strategy, + "chunking_strategy": chunking_strategy, + "content_filter": content_filter, + "cache_mode": cache_mode, + "bypass_cache": bypass_cache, + "disable_cache": disable_cache, + "no_cache_read": no_cache_read, + "no_cache_write": no_cache_write, + "css_selector": css_selector, + "screenshot": screenshot, + "pdf": pdf, + "verbose": verbose, + **kwargs + } + config = CrawlerRunConfig.from_kwargs(config_kwargs) - Returns: - CrawlResult: The result of crawling and processing - """ - # Check if url is not string and is not empty - if not isinstance(url, str) or not url: - raise ValueError("Invalid URL, make sure the URL is a non-empty string") - - async with self._lock or self.nullcontext(): # Lock for thread safety previously -> nullcontext(): - try: - # Handle deprecated parameters - if any([bypass_cache, disable_cache, no_cache_read, no_cache_write]): - if kwargs.get("warning", True): - warnings.warn( - "Cache control boolean flags are deprecated and will be removed in version X.X.X. " - "Use 'cache_mode' parameter instead. Examples:\n" - "- For bypass_cache=True, use cache_mode=CacheMode.BYPASS\n" - "- For disable_cache=True, use cache_mode=CacheMode.DISABLED\n" - "- For no_cache_read=True, use cache_mode=CacheMode.WRITE_ONLY\n" - "- For no_cache_write=True, use cache_mode=CacheMode.READ_ONLY\n" - "Pass warning=False to suppress this warning.", - DeprecationWarning, - stacklevel=2 - ) + # Handle deprecated cache parameters + if any([bypass_cache, disable_cache, no_cache_read, no_cache_write]): + if kwargs.get("warning", True): + warnings.warn( + "Cache control boolean flags are deprecated and will be removed in version 0.5.0. 
" + "Use 'cache_mode' parameter instead.", + DeprecationWarning, + stacklevel=2 + ) + + # Convert legacy parameters if cache_mode not provided + if config.cache_mode is None: + config.cache_mode = _legacy_to_cache_mode( + disable_cache=disable_cache, + bypass_cache=bypass_cache, + no_cache_read=no_cache_read, + no_cache_write=no_cache_write + ) - # Convert legacy parameters if cache_mode not provided - if cache_mode is None: - cache_mode = _legacy_to_cache_mode( - disable_cache=disable_cache, - bypass_cache=bypass_cache, - no_cache_read=no_cache_read, - no_cache_write=no_cache_write - ) - - # Default to ENABLED if no cache mode specified - if cache_mode is None: - cache_mode = CacheMode.ENABLED + # Default to ENABLED if no cache mode specified + if config.cache_mode is None: + config.cache_mode = CacheMode.ENABLED - # Create cache context - cache_context = CacheContext(url, cache_mode, self.always_bypass_cache) + # Create cache context + cache_context = CacheContext(url, config.cache_mode, self.always_bypass_cache) - extraction_strategy = extraction_strategy or NoExtractionStrategy() - extraction_strategy.verbose = verbose - if not isinstance(extraction_strategy, ExtractionStrategy): - raise ValueError("Unsupported extraction strategy") - if not isinstance(chunking_strategy, ChunkingStrategy): - raise ValueError("Unsupported chunking strategy") - - word_count_threshold = max(word_count_threshold, MIN_WORD_THRESHOLD) + # Initialize processing variables + async_response: AsyncCrawlResponse = None + cached_result = None + screenshot_data = None + pdf_data = None + extracted_content = None + start_time = time.perf_counter() - async_response: AsyncCrawlResponse = None - cached_result = None - screenshot_data = None - pdf_data = None - extracted_content = None - - start_time = time.perf_counter() - - # Try to get cached result if appropriate - if cache_context.should_read(): - cached_result = await async_db_manager.aget_cached_url(url) - - if cached_result: - html = 
sanitize_input_encode(cached_result.html) - extracted_content = sanitize_input_encode(cached_result.extracted_content or "") - if screenshot: + # Try to get cached result if appropriate + if cache_context.should_read(): + cached_result = await async_db_manager.aget_cached_url(url) + + if cached_result: + html = sanitize_input_encode(cached_result.html) + extracted_content = sanitize_input_encode(cached_result.extracted_content or "") + # If screenshot is requested but its not in cache, then set cache_result to None screenshot_data = cached_result.screenshot - if not screenshot_data: - cached_result = None - if pdf: pdf_data = cached_result.pdf - if not pdf_data: + if config.screenshot and not screenshot or config.pdf and not pdf: cached_result = None - # if verbose: - # print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Cache hit for {cache_context.display_url} | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {time.perf_counter() - start_time:.2f}s") - self.logger.url_status( + + self.logger.url_status( url=cache_context.display_url, success=bool(html), timing=time.perf_counter() - start_time, tag="FETCH" - ) + ) + # Fetch fresh content if needed + if not cached_result or not html: + t1 = time.perf_counter() + + if user_agent: + self.crawler_strategy.update_user_agent(user_agent) + + # Pass config to crawl method + async_response = await self.crawler_strategy.crawl( + url, + config=config # Pass the entire config object + ) + + html = sanitize_input_encode(async_response.html) + screenshot_data = async_response.screenshot + pdf_data = async_response.pdf_data + + t2 = time.perf_counter() + self.logger.url_status( + url=cache_context.display_url, + success=bool(html), + timing=t2 - t1, + tag="FETCH" + ) - # Fetch fresh content if needed - if not cached_result or not html: - t1 = time.perf_counter() - - if user_agent: - self.crawler_strategy.update_user_agent(user_agent) - async_response: AsyncCrawlResponse = 
await self.crawler_strategy.crawl( - url, - screenshot=screenshot, - pdf=pdf, - **kwargs + # Process the HTML content + crawl_result = await self.aprocess_html( + url=url, + html=html, + extracted_content=extracted_content, + config=config, # Pass the config object instead of individual parameters + screenshot=screenshot_data, + pdf_data=pdf_data, + verbose=config.verbose ) - html = sanitize_input_encode(async_response.html) - screenshot_data = async_response.screenshot - pdf_data = async_response.pdf_data - t2 = time.perf_counter() - self.logger.url_status( - url=cache_context.display_url, - success=bool(html), - timing=t2 - t1, - tag="FETCH" - ) - # if verbose: - # print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Live fetch for {cache_context.display_url}... | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {t2 - t1:.2f}s") - # Process the HTML content - crawl_result = await self.aprocess_html( - url=url, - html=html, - extracted_content=extracted_content, - word_count_threshold=word_count_threshold, - extraction_strategy=extraction_strategy, - chunking_strategy=chunking_strategy, - content_filter=content_filter, - css_selector=css_selector, - screenshot=screenshot_data, - pdf_data=pdf_data, - verbose=verbose, - is_cached=bool(cached_result), - async_response=async_response, - is_web_url=cache_context.is_web_url, - is_local_file=cache_context.is_local_file, - is_raw_html=cache_context.is_raw_html, - **kwargs, - ) - - # Set response data - if async_response: - crawl_result.status_code = async_response.status_code - crawl_result.response_headers = async_response.response_headers - crawl_result.downloaded_files = async_response.downloaded_files - else: - crawl_result.status_code = 200 - crawl_result.response_headers = cached_result.response_headers if cached_result else {} + # Set response data + if async_response: + crawl_result.status_code = async_response.status_code + crawl_result.response_headers = 
async_response.response_headers + crawl_result.downloaded_files = async_response.downloaded_files + else: + crawl_result.status_code = 200 + crawl_result.response_headers = cached_result.response_headers if cached_result else {} - crawl_result.success = bool(html) - crawl_result.session_id = kwargs.get("session_id", None) + crawl_result.success = bool(html) + crawl_result.session_id = getattr(config, 'session_id', None) - # if verbose: - # print(f"{Fore.GREEN}{self.tag_format('COMPLETE')} {self.log_icons['COMPLETE']} {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... | Status: {Fore.GREEN if crawl_result.success else Fore.RED}{crawl_result.success} | {Fore.YELLOW}Total: {time.perf_counter() - start_time:.2f}s{Style.RESET_ALL}") - self.logger.success( + self.logger.success( message="{url:.50}... | Status: {status} | Total: {timing}", tag="COMPLETE", params={ @@ -335,254 +347,312 @@ class AsyncWebCrawler: } ) - # Update cache if appropriate - if cache_context.should_write() and not bool(cached_result): - await async_db_manager.acache_url(crawl_result) + # Update cache if appropriate + if cache_context.should_write() and not bool(cached_result): + await async_db_manager.acache_url(crawl_result) - return crawl_result - - except Exception as e: - if not hasattr(e, "msg"): - e.msg = str(e) - # print(f"{Fore.RED}{self.tag_format('ERROR')} {self.log_icons['ERROR']} Failed to crawl {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... 
| {e.msg}{Style.RESET_ALL}") + return crawl_result + + except Exception as e: + error_context = get_error_context(sys.exc_info()) - self.logger.error_status( - # url=cache_context.display_url, - url=url, - error=create_box_message(e.msg, type = "error"), - tag="ERROR" - ) - return CrawlResult( - url=url, - html="", - success=False, - error_message=e.msg - ) - - async def arun_many( - self, - urls: List[str], - word_count_threshold=MIN_WORD_THRESHOLD, - extraction_strategy: ExtractionStrategy = None, - chunking_strategy: ChunkingStrategy = RegexChunking(), - content_filter: RelevantContentFilter = None, - cache_mode: Optional[CacheMode] = None, - # Deprecated parameters - bypass_cache: bool = False, - css_selector: str = None, - screenshot: bool = False, - pdf: bool = False, - user_agent: str = None, - verbose=True, - **kwargs, - ) -> List[CrawlResult]: - """ - Runs the crawler for multiple URLs concurrently. - - Migration from legacy parameters: - Old way (deprecated): - results = await crawler.arun_many(urls, bypass_cache=True) - - New way: - results = await crawler.arun_many(urls, cache_mode=CacheMode.BYPASS) - - Args: - urls: List of URLs to crawl - cache_mode: Cache behavior control (recommended) - [other parameters same as arun()] - - Returns: - List[CrawlResult]: Results for each URL - """ - if bypass_cache: - if kwargs.get("warning", True): - warnings.warn( - "'bypass_cache' is deprecated and will be removed in version X.X.X. " - "Use 'cache_mode=CacheMode.BYPASS' instead. 
" - "Pass warning=False to suppress this warning.", - DeprecationWarning, - stacklevel=2 - ) - if cache_mode is None: - cache_mode = CacheMode.BYPASS - - semaphore_count = kwargs.get('semaphore_count', 10) - semaphore = asyncio.Semaphore(semaphore_count) - - async def crawl_with_semaphore(url): - domain = urlparse(url).netloc - current_time = time.time() - - # print(f"{Fore.LIGHTBLACK_EX}{self.tag_format('PARALLEL')} Started task for {url[:50]}...{Style.RESET_ALL}") - self.logger.debug( - message="Started task for {url:.50}...", - tag="PARALLEL", - params={"url": url} - ) - - # Get delay settings from kwargs or use defaults - mean_delay = kwargs.get('mean_delay', 0.1) # 0.5 seconds default mean delay - max_range = kwargs.get('max_range', 0.3) # 1 seconds default max additional delay - - # Check if we need to wait - if domain in self._domain_last_hit: - time_since_last = current_time - self._domain_last_hit[domain] - if time_since_last < mean_delay: - delay = mean_delay + random.uniform(0, max_range) - await asyncio.sleep(delay) - - # Update last hit time - self._domain_last_hit[domain] = current_time + error_message = ( + f"Unexpected error in _crawl_web at line {error_context['line_no']} " + f"in {error_context['function']} ({error_context['filename']}):\n" + f"Error: {str(e)}\n\n" + f"Code context:\n{error_context['code_context']}" + ) + # if not hasattr(e, "msg"): + # e.msg = str(e) - async with semaphore: - return await self.arun( - url, - word_count_threshold=word_count_threshold, - extraction_strategy=extraction_strategy, - chunking_strategy=chunking_strategy, - content_filter=content_filter, - cache_mode=cache_mode, - css_selector=css_selector, - screenshot=screenshot, - user_agent=user_agent, - verbose=verbose, - **kwargs, - ) - - # Print start message - # print(f"{Fore.CYAN}{self.tag_format('INIT')} {self.log_icons['INIT']} Starting concurrent crawling for {len(urls)} URLs...{Style.RESET_ALL}") - self.logger.info( - message="Starting concurrent crawling 
for {count} URLs...", - tag="INIT", - params={"count": len(urls)} - ) - start_time = time.perf_counter() - tasks = [crawl_with_semaphore(url) for url in urls] - results = await asyncio.gather(*tasks, return_exceptions=True) - end_time = time.perf_counter() - # print(f"{Fore.YELLOW}{self.tag_format('COMPLETE')} {self.log_icons['COMPLETE']} Concurrent crawling completed for {len(urls)} URLs | Total time: {end_time - start_time:.2f}s{Style.RESET_ALL}") - self.logger.success( - message="Concurrent crawling completed for {count} URLs | " + Fore.YELLOW + " Total time: {timing}" + Style.RESET_ALL, - tag="COMPLETE", - params={ - "count": len(urls), - "timing": f"{end_time - start_time:.2f}s" - }, - colors={"timing": Fore.YELLOW} - ) - return [result if not isinstance(result, Exception) else str(result) for result in results] - + self.logger.error_status( + url=url, + error=create_box_message(error_message, type="error"), + tag="ERROR" + ) + + return CrawlResult( + url=url, + html="", + success=False, + error_message=error_message + ) async def aprocess_html( - self, - url: str, - html: str, - extracted_content: str, - word_count_threshold: int, - extraction_strategy: ExtractionStrategy, - chunking_strategy: ChunkingStrategy, - content_filter: RelevantContentFilter, - css_selector: str, - screenshot: str, - verbose: bool, - **kwargs, - ) -> CrawlResult: - # Extract content from HTML - try: - _url = url if not kwargs.get("is_raw_html", False) else "Raw HTML" - t1 = time.perf_counter() - scrapping_strategy = WebScrapingStrategy( - logger=self.logger, - ) - # result = await scrapping_strategy.ascrap( - result = scrapping_strategy.scrap( - url, - html, - word_count_threshold=word_count_threshold, - css_selector=css_selector, - only_text=kwargs.pop("only_text", False), - image_description_min_word_threshold=kwargs.pop( - "image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD - ), - content_filter = content_filter, - **kwargs, - ) + self, + url: str, + html: 
str, + extracted_content: str, + config: CrawlerRunConfig, + screenshot: str, + pdf_data: str, + verbose: bool, + **kwargs, + ) -> CrawlResult: + """ + Process HTML content using the provided configuration. + + Args: + url: The URL being processed + html: Raw HTML content + extracted_content: Previously extracted content (if any) + config: Configuration object controlling processing behavior + screenshot: Screenshot data (if any) + verbose: Whether to enable verbose logging + **kwargs: Additional parameters for backwards compatibility + + Returns: + CrawlResult: Processed result containing extracted and formatted content + """ + try: + _url = url if not kwargs.get("is_raw_html", False) else "Raw HTML" + t1 = time.perf_counter() - if result is None: - raise ValueError(f"Process HTML, Failed to extract content from the website: {url}") - except InvalidCSSSelectorError as e: - raise ValueError(str(e)) - except Exception as e: - raise ValueError(f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}") + # Initialize scraping strategy + scrapping_strategy = WebScrapingStrategy(logger=self.logger) - markdown_v2: MarkdownGenerationResult = result.get("markdown_v2", None) - - cleaned_html = sanitize_input_encode(result.get("cleaned_html", "")) - markdown = sanitize_input_encode(result.get("markdown", "")) - fit_markdown = sanitize_input_encode(result.get("fit_markdown", "")) - fit_html = sanitize_input_encode(result.get("fit_html", "")) - media = result.get("media", []) - links = result.get("links", []) - metadata = result.get("metadata", {}) - - # if verbose: - # print(f"{Fore.MAGENTA}{self.tag_format('SCRAPE')} {self.log_icons['SCRAPE']} Processed {_url[:URL_LOG_SHORTEN_LENGTH]}...{Style.RESET_ALL} | Time: {int((time.perf_counter() - t1) * 1000)}ms") - self.logger.info( - message="Processed {url:.50}... 
| Time: {timing}ms", - tag="SCRAPE", - params={ - "url": _url, - "timing": int((time.perf_counter() - t1) * 1000) - } - ) + # Process HTML content + result = scrapping_strategy.scrap( + url, + html, + word_count_threshold=config.word_count_threshold, + css_selector=config.css_selector, + only_text=config.only_text, + image_description_min_word_threshold=config.image_description_min_word_threshold, + content_filter=config.content_filter + ) + if result is None: + raise ValueError(f"Process HTML, Failed to extract content from the website: {url}") - if extracted_content is None and extraction_strategy and chunking_strategy and not isinstance(extraction_strategy, NoExtractionStrategy): - t1 = time.perf_counter() - # Check if extraction strategy is type of JsonCssExtractionStrategy - if isinstance(extraction_strategy, JsonCssExtractionStrategy) or isinstance(extraction_strategy, JsonCssExtractionStrategy): - extraction_strategy.verbose = verbose - extracted_content = extraction_strategy.run(url, [html]) - extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False) - else: - sections = chunking_strategy.chunk(markdown) - extracted_content = extraction_strategy.run(url, sections) - extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False) - # if verbose: - # print(f"{Fore.YELLOW}{self.tag_format('EXTRACT')} {self.log_icons['EXTRACT']} Completed for {_url[:URL_LOG_SHORTEN_LENGTH]}...{Style.RESET_ALL} | Time: {time.perf_counter() - t1:.2f}s{Style.RESET_ALL}") + except InvalidCSSSelectorError as e: + raise ValueError(str(e)) + except Exception as e: + raise ValueError(f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}") + + # Extract results + markdown_v2 = result.get("markdown_v2", None) + cleaned_html = sanitize_input_encode(result.get("cleaned_html", "")) + markdown = sanitize_input_encode(result.get("markdown", "")) + fit_markdown = 
sanitize_input_encode(result.get("fit_markdown", "")) + fit_html = sanitize_input_encode(result.get("fit_html", "")) + media = result.get("media", []) + links = result.get("links", []) + metadata = result.get("metadata", {}) + + # Log processing completion self.logger.info( - message="Completed for {url:.50}... | Time: {timing}s", - tag="EXTRACT", + message="Processed {url:.50}... | Time: {timing}ms", + tag="SCRAPE", params={ "url": _url, - "timing": time.perf_counter() - t1 + "timing": int((time.perf_counter() - t1) * 1000) } ) - screenshot = None if not screenshot else screenshot - pdf_data = kwargs.get("pdf_data", None) - - - if kwargs.get("prettiify", False): - cleaned_html = fast_format_html(cleaned_html) - - return CrawlResult( - url=url, - html=html, - cleaned_html=cleaned_html, - markdown_v2=markdown_v2, - markdown=markdown, - fit_markdown=fit_markdown, - fit_html= fit_html, - media=media, - links=links, - metadata=metadata, - screenshot=screenshot, - pdf=pdf_data, - extracted_content=extracted_content, - success=True, - error_message="", - ) + # Handle content extraction if needed + if (extracted_content is None and + config.extraction_strategy and + config.chunking_strategy and + not isinstance(config.extraction_strategy, NoExtractionStrategy)): + + t1 = time.perf_counter() + + # Handle different extraction strategy types + if isinstance(config.extraction_strategy, (JsonCssExtractionStrategy, JsonCssExtractionStrategy)): + config.extraction_strategy.verbose = verbose + extracted_content = config.extraction_strategy.run(url, [html]) + extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False) + else: + sections = config.chunking_strategy.chunk(markdown) + extracted_content = config.extraction_strategy.run(url, sections) + extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False) + + # Log extraction completion + self.logger.info( + message="Completed for {url:.50}... 
| Time: {timing}s", + tag="EXTRACT", + params={ + "url": _url, + "timing": time.perf_counter() - t1 + } + ) + + # Handle screenshot and PDF data + screenshot_data = None if not screenshot else screenshot + pdf_data = None if not pdf_data else pdf_data + + # Apply HTML formatting if requested + if config.prettiify: + cleaned_html = fast_format_html(cleaned_html) + + # Return complete crawl result + return CrawlResult( + url=url, + html=html, + cleaned_html=cleaned_html, + markdown_v2=markdown_v2, + markdown=markdown, + fit_markdown=fit_markdown, + fit_html=fit_html, + media=media, + links=links, + metadata=metadata, + screenshot=screenshot_data, + pdf=pdf_data, + extracted_content=extracted_content, + success=True, + error_message="", + ) + + async def arun_many( + self, + urls: List[str], + config: Optional[CrawlerRunConfig] = None, + # Legacy parameters maintained for backwards compatibility + word_count_threshold=MIN_WORD_THRESHOLD, + extraction_strategy: ExtractionStrategy = None, + chunking_strategy: ChunkingStrategy = RegexChunking(), + content_filter: RelevantContentFilter = None, + cache_mode: Optional[CacheMode] = None, + bypass_cache: bool = False, + css_selector: str = None, + screenshot: bool = False, + pdf: bool = False, + user_agent: str = None, + verbose=True, + **kwargs, + ) -> List[CrawlResult]: + """ + Runs the crawler for multiple URLs concurrently. + + Migration Guide: + Old way (deprecated): + results = await crawler.arun_many( + urls, + word_count_threshold=200, + screenshot=True, + ... + ) + + New way (recommended): + config = CrawlerRunConfig( + word_count_threshold=200, + screenshot=True, + ... 
+ ) + results = await crawler.arun_many(urls, crawler_config=config) + + Args: + urls: List of URLs to crawl + crawler_config: Configuration object controlling crawl behavior for all URLs + [other parameters maintained for backwards compatibility] + + Returns: + List[CrawlResult]: Results for each URL + """ + crawler_config = config + # Handle configuration + if crawler_config is not None: + if any(param is not None for param in [ + word_count_threshold, extraction_strategy, chunking_strategy, + content_filter, cache_mode, css_selector, screenshot, pdf + ]): + self.logger.warning( + message="Both crawler_config and legacy parameters provided. crawler_config will take precedence.", + tag="WARNING" + ) + config = crawler_config + else: + # Merge all parameters into a single kwargs dict for config creation + config_kwargs = { + "word_count_threshold": word_count_threshold, + "extraction_strategy": extraction_strategy, + "chunking_strategy": chunking_strategy, + "content_filter": content_filter, + "cache_mode": cache_mode, + "bypass_cache": bypass_cache, + "css_selector": css_selector, + "screenshot": screenshot, + "pdf": pdf, + "verbose": verbose, + **kwargs + } + config = CrawlerRunConfig.from_kwargs(config_kwargs) + + if bypass_cache: + if kwargs.get("warning", True): + warnings.warn( + "'bypass_cache' is deprecated and will be removed in version 0.5.0. " + "Use 'cache_mode=CacheMode.BYPASS' instead. 
" + "Pass warning=False to suppress this warning.", + DeprecationWarning, + stacklevel=2 + ) + if config.cache_mode is None: + config.cache_mode = CacheMode.BYPASS + + semaphore_count = config.semaphore_count or 5 + semaphore = asyncio.Semaphore(semaphore_count) + + async def crawl_with_semaphore(url): + # Handle rate limiting per domain + domain = urlparse(url).netloc + current_time = time.time() + + self.logger.debug( + message="Started task for {url:.50}...", + tag="PARALLEL", + params={"url": url} + ) + + # Get delay settings from config + mean_delay = config.mean_delay + max_range = config.max_range + + # Apply rate limiting + if domain in self._domain_last_hit: + time_since_last = current_time - self._domain_last_hit[domain] + if time_since_last < mean_delay: + delay = mean_delay + random.uniform(0, max_range) + await asyncio.sleep(delay) + + self._domain_last_hit[domain] = current_time + + async with semaphore: + return await self.arun( + url, + crawler_config=config, # Pass the entire config object + user_agent=user_agent # Maintain user_agent override capability + ) + + # Log start of concurrent crawling + self.logger.info( + message="Starting concurrent crawling for {count} URLs...", + tag="INIT", + params={"count": len(urls)} + ) + + # Execute concurrent crawls + start_time = time.perf_counter() + tasks = [crawl_with_semaphore(url) for url in urls] + results = await asyncio.gather(*tasks, return_exceptions=True) + end_time = time.perf_counter() + + # Log completion + self.logger.success( + message="Concurrent crawling completed for {count} URLs | Total time: {timing}", + tag="COMPLETE", + params={ + "count": len(urls), + "timing": f"{end_time - start_time:.2f}s" + }, + colors={ + "timing": Fore.YELLOW + } + ) + + return [result if not isinstance(result, Exception) else str(result) for result in results] async def aclear_cache(self): """Clear the cache database.""" diff --git a/crawl4ai/config.py b/crawl4ai/config.py index e17ff34f..7c8a9314 100644 --- 
a/crawl4ai/config.py +++ b/crawl4ai/config.py @@ -57,4 +57,6 @@ MAX_METRICS_HISTORY = 1000 NEED_MIGRATION = True URL_LOG_SHORTEN_LENGTH = 30 SHOW_DEPRECATION_WARNINGS = True -SCREENSHOT_HEIGHT_TRESHOLD = 10000 \ No newline at end of file +SCREENSHOT_HEIGHT_TRESHOLD = 10000 +PAGE_TIMEOUT=60000 +DOWNLOAD_PAGE_TIMEOUT=60000 \ No newline at end of file diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index 8a12ff0c..7ecc22da 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -29,7 +29,7 @@ class InvalidCSSSelectorError(Exception): def create_box_message( message: str, type: str = "info", - width: int = 80, + width: int = 120, add_newlines: bool = True, double_line: bool = False ) -> str: @@ -1223,7 +1223,8 @@ def ensure_content_dirs(base_path: str) -> Dict[str, str]: 'cleaned': 'cleaned_html', 'markdown': 'markdown_content', 'extracted': 'extracted_content', - 'screenshots': 'screenshots' + 'screenshots': 'screenshots', + 'screenshot': 'screenshots' } content_paths = {} @@ -1232,4 +1233,60 @@ def ensure_content_dirs(base_path: str) -> Dict[str, str]: os.makedirs(path, exist_ok=True) content_paths[key] = path - return content_paths \ No newline at end of file + return content_paths + +def get_error_context(exc_info, context_lines: int = 5): + """ + Extract error context with more reliable line number tracking. 
+ + Args: + exc_info: The exception info from sys.exc_info() + context_lines: Number of lines to show before and after the error + + Returns: + dict: Error context information + """ + import traceback + import linecache + import os + + # Get the full traceback + tb = traceback.extract_tb(exc_info[2]) + + # Get the last frame (where the error occurred) + last_frame = tb[-1] + filename = last_frame.filename + line_no = last_frame.lineno + func_name = last_frame.name + + # Get the source code context using linecache + # This is more reliable than inspect.getsourcelines + context_start = max(1, line_no - context_lines) + context_end = line_no + context_lines + 1 + + # Build the context lines with line numbers + context_lines = [] + for i in range(context_start, context_end): + line = linecache.getline(filename, i) + if line: + # Remove any trailing whitespace/newlines and add the pointer for error line + line = line.rstrip() + pointer = '→' if i == line_no else ' ' + context_lines.append(f"{i:4d} {pointer} {line}") + + # Join the lines with newlines + code_context = '\n'.join(context_lines) + + # Get relative path for cleaner output + try: + rel_path = os.path.relpath(filename) + except ValueError: + # Fallback if relpath fails (can happen on Windows with different drives) + rel_path = filename + + return { + "filename": rel_path, + "line_no": line_no, + "function": func_name, + "code_context": code_context + } \ No newline at end of file diff --git a/docs/examples/quickstart_async.config.py b/docs/examples/quickstart_async.config.py new file mode 100644 index 00000000..e1feba55 --- /dev/null +++ b/docs/examples/quickstart_async.config.py @@ -0,0 +1,517 @@ +import os, sys +sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) +os.environ['FIRECRAWL_API_KEY'] = "fc-84b370ccfad44beabc686b38f1769692" + +import asyncio +import time +import json +import re +from typing import Dict, List +from bs4 import BeautifulSoup +from pydantic 
import BaseModel, Field +from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig +from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator +from crawl4ai.content_filter_strategy import BM25ContentFilter, PruningContentFilter +from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy + +__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) + +print("Crawl4AI: Advanced Web Crawling and Data Extraction") +print("GitHub Repository: https://github.com/unclecode/crawl4ai") +print("Twitter: @unclecode") +print("Website: https://crawl4ai.com") + +# Basic Example - Simple Crawl +async def simple_crawl(): + print("\n--- Basic Usage ---") + browser_config = BrowserConfig(headless=True) + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + url="https://www.nbcnews.com/business", + config=crawler_config + ) + print(result.markdown[:500]) + +# JavaScript Execution Example +async def simple_example_with_running_js_code(): + print("\n--- Executing JavaScript and Using CSS Selectors ---") + + browser_config = BrowserConfig( + headless=True, + java_script_enabled=True + ) + + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + js_code=["const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"], + # wait_for="() => { return Array.from(document.querySelectorAll('article.tease-card')).length > 10; }" + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + url="https://www.nbcnews.com/business", + config=crawler_config + ) + print(result.markdown[:500]) + +# CSS Selector Example +async def simple_example_with_css_selector(): + print("\n--- Using CSS Selectors ---") + browser_config = 
BrowserConfig(headless=True) + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + css_selector=".wide-tease-item__description" + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + url="https://www.nbcnews.com/business", + config=crawler_config + ) + print(result.markdown[:500]) + +# Proxy Example +async def use_proxy(): + print("\n--- Using a Proxy ---") + browser_config = BrowserConfig( + headless=True, + proxy="http://your-proxy-url:port" + ) + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + url="https://www.nbcnews.com/business", + config=crawler_config + ) + if result.success: + print(result.markdown[:500]) + +# Screenshot Example +async def capture_and_save_screenshot(url: str, output_path: str): + browser_config = BrowserConfig(headless=True) + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + screenshot=True + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + url=url, + config=crawler_config + ) + + if result.success and result.screenshot: + import base64 + screenshot_data = base64.b64decode(result.screenshot) + with open(output_path, 'wb') as f: + f.write(screenshot_data) + print(f"Screenshot saved successfully to {output_path}") + else: + print("Failed to capture screenshot") + +# LLM Extraction Example +class OpenAIModelFee(BaseModel): + model_name: str = Field(..., description="Name of the OpenAI model.") + input_fee: str = Field(..., description="Fee for input token for the OpenAI model.") + output_fee: str = Field(..., description="Fee for output token for the OpenAI model.") + +async def extract_structured_data_using_llm(provider: str, api_token: str = None, extra_headers: Dict[str, str] = None): + print(f"\n--- Extracting Structured Data with {provider} ---") + + if api_token is None and provider != 
"ollama": + print(f"API token is required for {provider}. Skipping this example.") + return + + browser_config = BrowserConfig(headless=True) + + extra_args = { + "temperature": 0, + "top_p": 0.9, + "max_tokens": 2000 + } + if extra_headers: + extra_args["extra_headers"] = extra_headers + + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + word_count_threshold=1, + extraction_strategy=LLMExtractionStrategy( + provider=provider, + api_token=api_token, + schema=OpenAIModelFee.model_json_schema(), + extraction_type="schema", + instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens. + Do not miss any models in the entire content.""", + extra_args=extra_args + ) + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + url="https://openai.com/api/pricing/", + config=crawler_config + ) + print(result.extracted_content) + +# CSS Extraction Example +async def extract_structured_data_using_css_extractor(): + print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---") + schema = { + "name": "KidoCode Courses", + "baseSelector": "section.charge-methodology .w-tab-content > div", + "fields": [ + { + "name": "section_title", + "selector": "h3.heading-50", + "type": "text", + }, + { + "name": "section_description", + "selector": ".charge-content", + "type": "text", + }, + { + "name": "course_name", + "selector": ".text-block-93", + "type": "text", + }, + { + "name": "course_description", + "selector": ".course-content-text", + "type": "text", + }, + { + "name": "course_icon", + "selector": ".image-92", + "type": "attribute", + "attribute": "src" + } + ] + } + + browser_config = BrowserConfig( + headless=True, + java_script_enabled=True + ) + + js_click_tabs = """ + (async () => { + const tabs = document.querySelectorAll("section.charge-methodology .tabs-menu-3 > div"); + for(let tab of tabs) { + tab.scrollIntoView(); + 
tab.click(); + await new Promise(r => setTimeout(r, 500)); + } + })(); + """ + + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + extraction_strategy=JsonCssExtractionStrategy(schema), + js_code=[js_click_tabs] + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + url="https://www.kidocode.com/degrees/technology", + config=crawler_config + ) + + companies = json.loads(result.extracted_content) + print(f"Successfully extracted {len(companies)} companies") + print(json.dumps(companies[0], indent=2)) + +# Dynamic Content Examples - Method 1 +async def crawl_dynamic_content_pages_method_1(): + print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---") + first_commit = "" + + async def on_execution_started(page, **kwargs): + nonlocal first_commit + try: + while True: + await page.wait_for_selector("li.Box-sc-g0xbh4-0 h4") + commit = await page.query_selector("li.Box-sc-g0xbh4-0 h4") + commit = await commit.evaluate("(element) => element.textContent") + commit = re.sub(r"\s+", "", commit) + if commit and commit != first_commit: + first_commit = commit + break + await asyncio.sleep(0.5) + except Exception as e: + print(f"Warning: New content didn't appear after JavaScript execution: {e}") + + browser_config = BrowserConfig( + headless=False, + java_script_enabled=True + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started) + + url = "https://github.com/microsoft/TypeScript/commits/main" + session_id = "typescript_commits_session" + all_commits = [] + + js_next_page = """ + const button = document.querySelector('a[data-testid="pagination-next-button"]'); + if (button) button.click(); + """ + + for page in range(3): + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + css_selector="li.Box-sc-g0xbh4-0", + js_code=js_next_page if page > 0 else None, + js_only=page > 0, + 
session_id=session_id + ) + + result = await crawler.arun(url=url, config=crawler_config) + assert result.success, f"Failed to crawl page {page + 1}" + + soup = BeautifulSoup(result.cleaned_html, "html.parser") + commits = soup.select("li") + all_commits.extend(commits) + + print(f"Page {page + 1}: Found {len(commits)} commits") + + print(f"Successfully crawled {len(all_commits)} commits across 3 pages") + +# Dynamic Content Examples - Method 2 +async def crawl_dynamic_content_pages_method_2(): + print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---") + + browser_config = BrowserConfig( + headless=False, + java_script_enabled=True + ) + + js_next_page_and_wait = """ + (async () => { + const getCurrentCommit = () => { + const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4'); + return commits.length > 0 ? commits[0].textContent.trim() : null; + }; + + const initialCommit = getCurrentCommit(); + const button = document.querySelector('a[data-testid="pagination-next-button"]'); + if (button) button.click(); + + while (true) { + await new Promise(resolve => setTimeout(resolve, 100)); + const newCommit = getCurrentCommit(); + if (newCommit && newCommit !== initialCommit) { + break; + } + } + })(); + """ + + schema = { + "name": "Commit Extractor", + "baseSelector": "li.Box-sc-g0xbh4-0", + "fields": [ + { + "name": "title", + "selector": "h4.markdown-title", + "type": "text", + "transform": "strip", + }, + ], + } + + async with AsyncWebCrawler(config=browser_config) as crawler: + url = "https://github.com/microsoft/TypeScript/commits/main" + session_id = "typescript_commits_session" + all_commits = [] + + extraction_strategy = JsonCssExtractionStrategy(schema) + + for page in range(3): + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + css_selector="li.Box-sc-g0xbh4-0", + extraction_strategy=extraction_strategy, + js_code=js_next_page_and_wait if page > 0 else None, + js_only=page > 0, + session_id=session_id + ) + + result 
= await crawler.arun(url=url, config=crawler_config) + assert result.success, f"Failed to crawl page {page + 1}" + + commits = json.loads(result.extracted_content) + all_commits.extend(commits) + print(f"Page {page + 1}: Found {len(commits)} commits") + + print(f"Successfully crawled {len(all_commits)} commits across 3 pages") + +# Browser Comparison +async def crawl_custom_browser_type(): + print("\n--- Browser Comparison ---") + + # Firefox + browser_config_firefox = BrowserConfig( + browser_type="firefox", + headless=True + ) + start = time.time() + async with AsyncWebCrawler(config=browser_config_firefox) as crawler: + result = await crawler.arun( + url="https://www.example.com", + config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS) + ) + print("Firefox:", time.time() - start) + print(result.markdown[:500]) + + # WebKit + browser_config_webkit = BrowserConfig( + browser_type="webkit", + headless=True + ) + start = time.time() + async with AsyncWebCrawler(config=browser_config_webkit) as crawler: + result = await crawler.arun( + url="https://www.example.com", + config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS) + ) + print("WebKit:", time.time() - start) + print(result.markdown[:500]) + + # Chromium (default) + browser_config_chromium = BrowserConfig( + browser_type="chromium", + headless=True + ) + start = time.time() + async with AsyncWebCrawler(config=browser_config_chromium) as crawler: + result = await crawler.arun( + url="https://www.example.com", + config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS) + ) + print("Chromium:", time.time() - start) + print(result.markdown[:500]) + +# Anti-Bot and User Simulation +async def crawl_with_user_simulation(): + browser_config = BrowserConfig( + headless=True, + user_agent_mode="random", + user_agent_generator_config={ + "device_type": "mobile", + "os_type": "android" + } + ) + + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + magic=True, + simulate_user=True, + override_navigator=True + ) + + 
async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + url="YOUR-URL-HERE", + config=crawler_config + ) + print(result.markdown) + +# Speed Comparison +async def speed_comparison(): + print("\n--- Speed Comparison ---") + + # Firecrawl comparison + from firecrawl import FirecrawlApp + app = FirecrawlApp(api_key=os.environ['FIRECRAWL_API_KEY']) + start = time.time() + scrape_status = app.scrape_url( + 'https://www.nbcnews.com/business', + params={'formats': ['markdown', 'html']} + ) + end = time.time() + print("Firecrawl:") + print(f"Time taken: {end - start:.2f} seconds") + print(f"Content length: {len(scrape_status['markdown'])} characters") + print(f"Images found: {scrape_status['markdown'].count('cldnry.s-nbcnews.com')}") + print() + + # Crawl4AI comparisons + browser_config = BrowserConfig(headless=True) + + # Simple crawl + async with AsyncWebCrawler(config=browser_config) as crawler: + start = time.time() + result = await crawler.arun( + url="https://www.nbcnews.com/business", + config=CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + word_count_threshold=0 + ) + ) + end = time.time() + print("Crawl4AI (simple crawl):") + print(f"Time taken: {end - start:.2f} seconds") + print(f"Content length: {len(result.markdown)} characters") + print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}") + print() + + # Advanced filtering + start = time.time() + result = await crawler.arun( + url="https://www.nbcnews.com/business", + config=CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + word_count_threshold=0, + markdown_generator=DefaultMarkdownGenerator( + content_filter=PruningContentFilter( + threshold=0.48, + threshold_type="fixed", + min_word_threshold=0 + ) + ) + ) + ) + end = time.time() + print("Crawl4AI (Markdown Plus):") + print(f"Time taken: {end - start:.2f} seconds") + print(f"Content length: {len(result.markdown_v2.raw_markdown)} characters") + print(f"Fit Markdown: 
{len(result.markdown_v2.fit_markdown)} characters") + print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}") + print() + +# Main execution +async def main(): + # Basic examples + # await simple_crawl() + # await simple_example_with_running_js_code() + # await simple_example_with_css_selector() + + # Advanced examples + # await extract_structured_data_using_css_extractor() + # await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY")) + # await crawl_dynamic_content_pages_method_1() + # await crawl_dynamic_content_pages_method_2() + + # Browser comparisons + await crawl_custom_browser_type() + + # Performance testing + # await speed_comparison() + + # Screenshot example + await capture_and_save_screenshot( + "https://www.example.com", + os.path.join(__location__, "tmp/example_screenshot.jpg") + ) + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file diff --git a/docs/md_v2/basic/cache-modes.md b/docs/md_v2/basic/cache-modes.md index 04a4f218..01cfe34e 100644 --- a/docs/md_v2/basic/cache-modes.md +++ b/docs/md_v2/basic/cache-modes.md @@ -1,7 +1,7 @@ # Crawl4AI Cache System and Migration Guide ## Overview -Starting from version X.X.X, Crawl4AI introduces a new caching system that replaces the old boolean flags with a more intuitive `CacheMode` enum. This change simplifies cache control and makes the behavior more predictable. +Starting from version 0.5.0, Crawl4AI introduces a new caching system that replaces the old boolean flags with a more intuitive `CacheMode` enum. This change simplifies cache control and makes the behavior more predictable. 
## Old vs New Approach diff --git a/tests/async/test_0.4.2_config_params.py b/tests/async/test_0.4.2_config_params.py new file mode 100644 index 00000000..28529ac2 --- /dev/null +++ b/tests/async/test_0.4.2_config_params.py @@ -0,0 +1,231 @@ +import os, sys +parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +sys.path.append(parent_dir) +__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) + +import asyncio +from crawl4ai import AsyncWebCrawler, CacheMode +from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig +from crawl4ai.content_filter_strategy import PruningContentFilter +from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai.chunking_strategy import RegexChunking +from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator + +# Category 1: Browser Configuration Tests +async def test_browser_config_object(): + """Test the new BrowserConfig object with various browser settings""" + browser_config = BrowserConfig( + browser_type="chromium", + headless=False, + viewport_width=1920, + viewport_height=1080, + use_managed_browser=True, + user_agent_mode="random", + user_agent_generator_config={"device_type": "desktop", "os_type": "windows"} + ) + + async with AsyncWebCrawler(config=browser_config, verbose=True) as crawler: + result = await crawler.arun('https://example.com', cache_mode=CacheMode.BYPASS) + assert result.success, "Browser config crawl failed" + assert len(result.html) > 0, "No HTML content retrieved" + +async def test_browser_performance_config(): + """Test browser configurations focused on performance""" + browser_config = BrowserConfig( + text_only=True, + light_mode=True, + extra_args=['--disable-gpu', '--disable-software-rasterizer'], + ignore_https_errors=True, + java_script_enabled=False + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun('https://example.com') + assert result.success, 
"Performance optimized crawl failed" + assert result.status_code == 200, "Unexpected status code" + +# Category 2: Content Processing Tests +async def test_content_extraction_config(): + """Test content extraction with various strategies""" + crawler_config = CrawlerRunConfig( + word_count_threshold=300, + extraction_strategy=JsonCssExtractionStrategy( + schema={ + "name": "article", + "baseSelector": "div", + "fields": [{ + "name": "title", + "selector": "h1", + "type": "text" + }] + } + ), + chunking_strategy=RegexChunking(), + content_filter=PruningContentFilter() + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + 'https://example.com/article', + config=crawler_config + ) + assert result.extracted_content is not None, "Content extraction failed" + assert 'title' in result.extracted_content, "Missing expected content field" + +# Category 3: Cache and Session Management Tests +async def test_cache_and_session_management(): + """Test different cache modes and session handling""" + browser_config = BrowserConfig(use_persistent_context=True) + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.WRITE_ONLY, + process_iframes=True, + remove_overlay_elements=True + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + # First request - should write to cache + result1 = await crawler.arun( + 'https://example.com', + config=crawler_config + ) + + # Second request - should use fresh fetch due to WRITE_ONLY mode + result2 = await crawler.arun( + 'https://example.com', + config=crawler_config + ) + + assert result1.success and result2.success, "Cache mode crawl failed" + assert result1.html == result2.html, "Inconsistent results between requests" + +# Category 4: Media Handling Tests +async def test_media_handling_config(): + """Test configurations related to media handling""" + # Get the base path for home directroy ~/.crawl4ai/downloads, make sure it exists + os.makedirs(os.path.expanduser("~/.crawl4ai/downloads"), 
exist_ok=True) + browser_config = BrowserConfig( + viewport_width=1920, + viewport_height=1080, + accept_downloads=True, + downloads_path= os.path.expanduser("~/.crawl4ai/downloads") + ) + crawler_config = CrawlerRunConfig( + screenshot=True, + pdf=True, + adjust_viewport_to_content=True, + wait_for_images=True, + screenshot_height_threshold=20000 + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + 'https://example.com', + config=crawler_config + ) + assert result.screenshot is not None, "Screenshot capture failed" + assert result.pdf is not None, "PDF generation failed" + +# Category 5: Anti-Bot and Site Interaction Tests +async def test_antibot_config(): + """Test configurations for handling anti-bot measures""" + crawler_config = CrawlerRunConfig( + simulate_user=True, + override_navigator=True, + magic=True, + wait_for="js:()=>document.querySelector('body')", + delay_before_return_html=1.0, + log_console=True, + cache_mode=CacheMode.BYPASS + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + 'https://example.com', + config=crawler_config + ) + assert result.success, "Anti-bot measure handling failed" + +# Category 6: Parallel Processing Tests +async def test_parallel_processing(): + """Test parallel processing capabilities""" + crawler_config = CrawlerRunConfig( + mean_delay=0.5, + max_range=1.0, + semaphore_count=5 + ) + + urls = [ + 'https://example.com/1', + 'https://example.com/2', + 'https://example.com/3' + ] + + async with AsyncWebCrawler() as crawler: + results = await crawler.arun_many( + urls, + config=crawler_config + ) + assert len(results) == len(urls), "Not all URLs were processed" + assert all(r.success for r in results), "Some parallel requests failed" + +# Category 7: Backwards Compatibility Tests +async def test_legacy_parameter_support(): + """Test that legacy parameters still work""" + async with AsyncWebCrawler( + headless=True, + browser_type="chromium", + 
viewport_width=1024, + viewport_height=768 + ) as crawler: + result = await crawler.arun( + 'https://example.com', + screenshot=True, + word_count_threshold=200, + bypass_cache=True, + css_selector=".main-content" + ) + assert result.success, "Legacy parameter support failed" + +# Category 8: Mixed Configuration Tests +async def test_mixed_config_usage(): + """Test mixing new config objects with legacy parameters""" + browser_config = BrowserConfig(headless=True) + crawler_config = CrawlerRunConfig(screenshot=True) + + async with AsyncWebCrawler( + config=browser_config, + verbose=True # legacy parameter + ) as crawler: + result = await crawler.arun( + 'https://example.com', + config=crawler_config, + cache_mode=CacheMode.BYPASS, # legacy parameter + css_selector="body" # legacy parameter + ) + assert result.success, "Mixed configuration usage failed" + +if __name__ == "__main__": + async def run_tests(): + test_functions = [ + test_browser_config_object, + # test_browser_performance_config, + # test_content_extraction_config, + # test_cache_and_session_management, + # test_media_handling_config, + # test_antibot_config, + # test_parallel_processing, + # test_legacy_parameter_support, + # test_mixed_config_usage + ] + + for test in test_functions: + print(f"\nRunning {test.__name__}...") + try: + await test() + print(f"✓ {test.__name__} passed") + except AssertionError as e: + print(f"✗ {test.__name__} failed: {str(e)}") + except Exception as e: + print(f"✗ {test.__name__} error: {str(e)}") + + asyncio.run(run_tests()) \ No newline at end of file From de1766d565daf8eeb73f0315718b73ab3bd9748e Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 12 Dec 2024 19:35:30 +0800 Subject: [PATCH 64/70] Bump version to 0.4.2 --- crawl4ai/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index 80861132..ee26a705 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,2 +1,2 @@ # 
crawl4ai/_version.py -__version__ = "0.4.1" +__version__ = "0.4.2" From 3d69715dbabd2099fd64d7489b997b528b9c3fce Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 12 Dec 2024 19:57:59 +0800 Subject: [PATCH 65/70] chore: Update .gitignore to include new files and directories --- .gitignore | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 52e25a2a..1d3429e3 100644 --- a/.gitignore +++ b/.gitignore @@ -206,6 +206,7 @@ pypi_build.sh git_issues.py git_issues.md +.next .tests/ .issues/ .docs/ @@ -214,4 +215,6 @@ git_issues.md todo_executor.md protect-all-except-feature.sh manage-collab.sh -publish.sh \ No newline at end of file +publish.sh +combine.sh +combined_output.txt \ No newline at end of file From 4a72c5ea6e2f39e56747a5936d99bf0ece890ce3 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 12 Dec 2024 20:15:50 +0800 Subject: [PATCH 66/70] Add release notes and documentation for version 0.4.2: Configurable Crawlers, Session Management, and Enhanced Screenshot/PDF features --- docs/md_v2/blog/index.md | 10 ++++ docs/md_v2/blog/releases/0.4.2.md | 86 +++++++++++++++++++++++++++++++ 2 files changed, 96 insertions(+) create mode 100644 docs/md_v2/blog/releases/0.4.2.md diff --git a/docs/md_v2/blog/index.md b/docs/md_v2/blog/index.md index 28ccfa6b..f7c8494d 100644 --- a/docs/md_v2/blog/index.md +++ b/docs/md_v2/blog/index.md @@ -4,6 +4,15 @@ Welcome to the Crawl4AI blog! Here you'll find detailed release notes, technical ## Latest Release +### [0.4.2 - Configurable Crawlers, Session Management, and Smarter Screenshots](releases/0.4.2.md) +*December 12, 2024* + +The 0.4.2 update brings massive improvements to configuration, making crawlers and browsers easier to manage with dedicated objects. You can now import/export local storage for seamless session management. Plus, long-page screenshots are faster and cleaner, and full-page PDF exports are now possible. 
Check out all the new features to make your crawling experience even smoother. + +[Read full release notes →](releases/0.4.2.md) + +--- + ### [0.4.1 - Smarter Crawling with Lazy-Load Handling, Text-Only Mode, and More](releases/0.4.1.md) *December 8, 2024* @@ -35,3 +44,4 @@ Curious about how Crawl4AI has evolved? Check out our [complete changelog](https - Star us on [GitHub](https://github.com/unclecode/crawl4ai) - Follow [@unclecode](https://twitter.com/unclecode) on Twitter - Join our community discussions on GitHub + diff --git a/docs/md_v2/blog/releases/0.4.2.md b/docs/md_v2/blog/releases/0.4.2.md new file mode 100644 index 00000000..6f8f39e9 --- /dev/null +++ b/docs/md_v2/blog/releases/0.4.2.md @@ -0,0 +1,86 @@ +## 🚀 Crawl4AI 0.4.2 Update: Smarter Crawling Just Got Easier (Dec 12, 2024) + +### Hey Developers, + +I’m excited to share Crawl4AI 0.4.2—a major upgrade that makes crawling smarter, faster, and a whole lot more intuitive. I’ve packed in a bunch of new features to simplify your workflows and improve your experience. Let’s cut to the chase! + +--- + +### 🔧 **Configurable Browser and Crawler Behavior** + +You’ve asked for better control over how browsers and crawlers are configured, and now you’ve got it. With the new `BrowserConfig` and `CrawlerRunConfig` objects, you can set up your browser and crawling behavior exactly how you want. No more cluttering `arun` with a dozen arguments—just pass in your configs and go. 
+ +**Example:** +```python +from crawl4ai import BrowserConfig, CrawlerRunConfig, AsyncWebCrawler + +browser_config = BrowserConfig(headless=True, viewport_width=1920, viewport_height=1080) +crawler_config = CrawlerRunConfig(cache_mode="BYPASS") + +async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun(url="https://example.com", config=crawler_config) + print(result.markdown[:500]) +``` + +This setup is a game-changer for scalability, keeping your code clean and flexible as we add more parameters in the future. + +Remember: If you prefer the old way, you can still pass arguments directly to `arun` as before, no worries! + +--- + +### 🔐 **Streamlined Session Management** + +Here’s the big one: You can now pass local storage and cookies directly. Whether it’s setting values programmatically or importing a saved JSON state, managing sessions has never been easier. This is a must-have for authenticated crawls—just export your storage state once and reuse it effortlessly across runs. + +**Example:** +1. Open a browser, log in manually, and export the storage state. +2. Import the JSON file for seamless authenticated crawling: + +```python +result = await crawler.arun( + url="https://example.com/protected", + storage_state="my_storage_state.json" +) +``` + +--- + +### 🔢 **Handling Large Pages: Supercharged Screenshots and PDF Conversion** + +Two big upgrades here: + +- **Blazing-fast long-page screenshots**: Turn extremely long web pages into clean, high-quality screenshots—without breaking a sweat. It’s optimized to handle large content without lag. + +- **Full-page PDF exports**: Now, you can also convert any page into a PDF with all the details intact. Perfect for archiving or sharing complex layouts. + +--- + +### 🔧 **Other Cool Stuff** + +- **Anti-bot enhancements**: Magic mode now handles overlays, user simulation, and anti-detection features like a pro. 
+- **JavaScript execution**: Execute custom JS snippets to handle dynamic content. No more wrestling with endless page interactions. + +--- + +### 📊 **Performance Boosts and Dev-friendly Updates** + +- Faster rendering and viewport adjustments for better performance. +- Improved cookie and local storage handling for seamless authentication. +- Better debugging with detailed logs and actionable error messages. + +--- + +### 🔠 **Use Cases You’ll Love** + +1. **Authenticated Crawls**: Log in once, export your storage state, and reuse it across multiple requests without the headache. +2. **Long-page Screenshots**: Perfect for blogs, e-commerce pages, or any endless-scroll website. +3. **PDF Export**: Create professional-looking page PDFs in seconds. + +--- + +### Let’s Get Crawling + +Crawl4AI 0.4.2 is ready for you to download and try. I’m always looking for ways to improve, so don’t hold back—share your thoughts and feedback. + +Happy Crawling! 🚀 + From 7af1d32ef6734a63e47c6b585e7fc1511e124c2d Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 12 Dec 2024 20:18:44 +0800 Subject: [PATCH 67/70] Update README for version 0.4.2: Reflect new features and enhancements --- README.md | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 095c595c..36ee81a9 100644 --- a/README.md +++ b/README.md @@ -11,9 +11,9 @@ Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant community. It delivers blazing-fast, AI-ready web crawling tailored for LLMs, AI agents, and data pipelines. Open source, flexible, and built for real-time performance, Crawl4AI empowers developers with unmatched speed, precision, and deployment ease. -[✨ Check out latest update v0.4.1](#-recent-updates) +[✨ Check out latest update v0.4.2](#-recent-updates) -🎉 **Version 0.4.x is out!** Introducing our experimental PruningContentFilter - a powerful new algorithm for smarter Markdown generation. 
Test it out and [share your feedback](https://github.com/unclecode/crawl4ai/issues)! [Read the release notes →](https://crawl4ai.com/mkdocs/blog) +🎉 **Version 0.4.2 is out!** Introducing our experimental PruningContentFilter - a powerful new algorithm for smarter Markdown generation. Test it out and [share your feedback](https://github.com/unclecode/crawl4ai/issues)! [Read the release notes →](https://crawl4ai.com/mkdocs/blog) ## 🧐 Why Crawl4AI? @@ -626,6 +626,10 @@ async def test_news_crawl(): ## ✨ Recent Updates +- 🔧 **Configurable Crawlers and Browsers**: Simplified crawling with `BrowserConfig` and `CrawlerRunConfig`, making setups cleaner and more scalable. +- 🔐 **Session Management Enhancements**: Import/export local storage for personalized crawling with seamless session reuse. +- 📸 **Supercharged Screenshots**: Take lightning-fast, full-page screenshots of very long pages. +- 📜 **Full-Page PDF Export**: Convert any web page into a PDF for easy sharing or archiving. - 🖼️ **Lazy Load Handling**: Improved support for websites with lazy-loaded images. The crawler now waits for all images to fully load, ensuring no content is missed. - ⚡ **Text-Only Mode**: New mode for fast, lightweight crawling. Disables images, JavaScript, and GPU rendering, improving speed by 3-4x for text-focused crawls. - 📐 **Dynamic Viewport Adjustment**: Automatically adjusts the browser viewport to fit page content, ensuring accurate rendering and capturing of all elements. @@ -633,7 +637,8 @@ async def test_news_crawl(): - 🧑‍💻 **Session Reuse**: Introduced `create_session` for efficient crawling by reusing the same browser session across multiple requests. - 🌟 **Light Mode**: Optimized browser performance by disabling unnecessary features like extensions, background timers, and sync processes. -Read the full details of this release in our [0.4.1 Release Notes](https://github.com/unclecode/crawl4ai/blob/main/docs/md_v2/blog/releases/0.4.1.md). 
+ +Read the full details of this release in our [0.4.2 Release Notes](https://github.com/unclecode/crawl4ai/blob/main/docs/md_v2/blog/releases/0.4.2.md). ## 📖 Documentation & Roadmap From 7524aa7b5ec2dc2973719f4d68386ea2b9134168 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 13 Dec 2024 21:51:38 +0800 Subject: [PATCH 68/70] Feature: Add Markdown generation to CrawlerRunConfig - Added markdown generator parameter to CrawlerRunConfig in `async_configs.py`. - Implemented logic for Markdown generation in content scraping in `async_webcrawler.py`. - Updated version number to 0.4.21 in `__version__.py`. --- crawl4ai/__version__.py | 2 +- crawl4ai/async_configs.py | 4 ++++ crawl4ai/async_webcrawler.py | 32 ++++++++++++++++++--------- crawl4ai/content_scraping_strategy.py | 16 +++++++------- docs/examples/quickstart_async.py | 20 +++++++++-------- 5 files changed, 46 insertions(+), 28 deletions(-) diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index ee26a705..38b432bd 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,2 +1,2 @@ # crawl4ai/_version.py -__version__ = "0.4.2" +__version__ = "0.4.21" diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index 41574fe6..aa0b849e 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -7,6 +7,7 @@ from .config import ( from .user_agent_generator import UserAgentGenerator from .extraction_strategy import ExtractionStrategy from .chunking_strategy import ChunkingStrategy +from .markdown_generation_strategy import MarkdownGenerationStrategy class BrowserConfig: """ @@ -269,6 +270,7 @@ class CrawlerRunConfig: word_count_threshold: int = MIN_WORD_THRESHOLD , extraction_strategy : ExtractionStrategy=None, # Will default to NoExtractionStrategy if None chunking_strategy : ChunkingStrategy= None, # Will default to RegexChunking if None + markdown_generator : MarkdownGenerationStrategy = None, content_filter=None, cache_mode=None, session_id: str = None, @@ -309,6 
+311,7 @@ class CrawlerRunConfig: self.word_count_threshold = word_count_threshold self.extraction_strategy = extraction_strategy self.chunking_strategy = chunking_strategy + self.markdown_generator = markdown_generator self.content_filter = content_filter self.cache_mode = cache_mode self.session_id = session_id @@ -364,6 +367,7 @@ class CrawlerRunConfig: word_count_threshold=kwargs.get("word_count_threshold", 200), extraction_strategy=kwargs.get("extraction_strategy"), chunking_strategy=kwargs.get("chunking_strategy"), + markdown_generator=kwargs.get("markdown_generator"), content_filter=kwargs.get("content_filter"), cache_mode=kwargs.get("cache_mode"), session_id=kwargs.get("session_id"), diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 8515a387..9b968158 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -7,7 +7,8 @@ from pathlib import Path from typing import Optional, List, Union import json import asyncio -from contextlib import nullcontext, asynccontextmanager +# from contextlib import nullcontext, asynccontextmanager +from contextlib import asynccontextmanager from .models import CrawlResult, MarkdownGenerationResult from .async_database import async_db_manager from .chunking_strategy import * @@ -15,6 +16,7 @@ from .content_filter_strategy import * from .extraction_strategy import * from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode +from .markdown_generation_strategy import DefaultMarkdownGenerator, MarkdownGenerationStrategy from .content_scraping_strategy import WebScrapingStrategy from .async_logger import AsyncLogger from .async_configs import BrowserConfig, CrawlerRunConfig @@ -132,17 +134,12 @@ class AsyncWebCrawler: async def __aexit__(self, exc_type, exc_val, exc_tb): await self.crawler_strategy.__aexit__(exc_type, exc_val, exc_tb) - - @asynccontextmanager - 
async def nullcontext(self): - yield async def awarmup(self): """Initialize the crawler with warm-up sequence.""" self.logger.info(f"Crawl4AI {crawl4ai_version}", tag="INIT") self.ready = True - @asynccontextmanager async def nullcontext(self): """异步空上下文管理器""" @@ -323,7 +320,8 @@ class AsyncWebCrawler: config=config, # Pass the config object instead of individual parameters screenshot=screenshot_data, pdf_data=pdf_data, - verbose=config.verbose + verbose=config.verbose, + **kwargs ) # Set response data @@ -424,7 +422,8 @@ class AsyncWebCrawler: css_selector=config.css_selector, only_text=config.only_text, image_description_min_word_threshold=config.image_description_min_word_threshold, - content_filter=config.content_filter + content_filter=config.content_filter, + **kwargs ) if result is None: @@ -435,16 +434,29 @@ class AsyncWebCrawler: except Exception as e: raise ValueError(f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}") + + # Extract results - markdown_v2 = result.get("markdown_v2", None) cleaned_html = sanitize_input_encode(result.get("cleaned_html", "")) - markdown = sanitize_input_encode(result.get("markdown", "")) fit_markdown = sanitize_input_encode(result.get("fit_markdown", "")) fit_html = sanitize_input_encode(result.get("fit_html", "")) media = result.get("media", []) links = result.get("links", []) metadata = result.get("metadata", {}) + # Markdown Generation + markdown_generator: Optional[MarkdownGenerationStrategy] = config.markdown_generator or DefaultMarkdownGenerator() + if not config.content_filter and not markdown_generator.content_filter: + markdown_generator.content_filter = PruningContentFilter() + + markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown( + cleaned_html=cleaned_html, + base_url=url, + # html2text_options=kwargs.get('html2text', {}) + ) + markdown_v2 = markdown_result + markdown = sanitize_input_encode(markdown_result.raw_markdown) + # Log processing completion 
self.logger.info( message="Processed {url:.50}... | Time: {timing}ms", diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index f58e1eac..4ba9a605 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -602,16 +602,16 @@ class WebScrapingStrategy(ContentScrapingStrategy): cleaned_html = str_body.replace('\n\n', '\n').replace(' ', ' ') - markdown_content = self._generate_markdown_content( - cleaned_html=cleaned_html, - html=html, - url=url, - success=success, - **kwargs - ) + # markdown_content = self._generate_markdown_content( + # cleaned_html=cleaned_html, + # html=html, + # url=url, + # success=success, + # **kwargs + # ) return { - **markdown_content, + # **markdown_content, 'cleaned_html': cleaned_html, 'success': success, 'media': media, diff --git a/docs/examples/quickstart_async.py b/docs/examples/quickstart_async.py index 1c76bf18..bd4c425f 100644 --- a/docs/examples/quickstart_async.py +++ b/docs/examples/quickstart_async.py @@ -239,8 +239,10 @@ async def crawl_dynamic_content_pages_method_1(): all_commits = [] js_next_page = """ - const button = document.querySelector('a[data-testid="pagination-next-button"]'); - if (button) button.click(); + (() => { + const button = document.querySelector('a[data-testid="pagination-next-button"]'); + if (button) button.click(); + })(); """ for page in range(3): # Crawl 3 pages @@ -604,14 +606,14 @@ async def fit_markdown_remove_overlay(): async def main(): - await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY")) + # await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY")) - await simple_crawl() - await simple_example_with_running_js_code() - await simple_example_with_css_selector() - # await use_proxy() - await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg")) - await extract_structured_data_using_css_extractor() + # 
await simple_crawl() + # await simple_example_with_running_js_code() + # await simple_example_with_css_selector() + # # await use_proxy() + # await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg")) + # await extract_structured_data_using_css_extractor() # LLM extraction examples # await extract_structured_data_using_llm() From e9e5b5642d8c4612c27a76880ea5670a78e2ab2b Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sun, 15 Dec 2024 19:49:30 +0800 Subject: [PATCH 69/70] Fix js_snipprt issue 0.4.21 bump to 0.4.22 --- MANIFEST.in | 3 ++- docs/examples/quickstart_async.config.py | 13 +++++++------ setup.py | 3 +++ 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/MANIFEST.in b/MANIFEST.in index 540b7204..73a0e00b 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1 +1,2 @@ -include requirements.txt \ No newline at end of file +include requirements.txt +recursive-include crawl4ai/js_snippet *.js \ No newline at end of file diff --git a/docs/examples/quickstart_async.config.py b/docs/examples/quickstart_async.config.py index e1feba55..ff312688 100644 --- a/docs/examples/quickstart_async.config.py +++ b/docs/examples/quickstart_async.config.py @@ -142,6 +142,7 @@ async def extract_structured_data_using_llm(provider: str, api_token: str = None crawler_config = CrawlerRunConfig( cache_mode=CacheMode.BYPASS, word_count_threshold=1, + page_timeout = 80000, extraction_strategy=LLMExtractionStrategy( provider=provider, api_token=api_token, @@ -497,21 +498,21 @@ async def main(): # Advanced examples # await extract_structured_data_using_css_extractor() - # await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY")) + await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY")) # await crawl_dynamic_content_pages_method_1() # await crawl_dynamic_content_pages_method_2() # Browser comparisons - await crawl_custom_browser_type() + # await 
crawl_custom_browser_type() # Performance testing # await speed_comparison() # Screenshot example - await capture_and_save_screenshot( - "https://www.example.com", - os.path.join(__location__, "tmp/example_screenshot.jpg") - ) + # await capture_and_save_screenshot( + # "https://www.example.com", + # os.path.join(__location__, "tmp/example_screenshot.jpg") + # ) if __name__ == "__main__": asyncio.run(main()) \ No newline at end of file diff --git a/setup.py b/setup.py index e6840cd0..bc6b00c2 100644 --- a/setup.py +++ b/setup.py @@ -57,6 +57,9 @@ setup( author_email="unclecode@kidocode.com", license="MIT", packages=find_packages(), + package_data={ + 'crawl4ai': ['js_snippet/*.js'] # This matches the exact path structure + }, install_requires=default_requirements + ["playwright", "aiofiles"], # Added aiofiles extras_require={ From ed7bc1909cdd49e23bc665d746f60a0b2fcaf594 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sun, 15 Dec 2024 19:49:38 +0800 Subject: [PATCH 70/70] Bump version to 0.4.22 --- crawl4ai/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index 38b432bd..c9c35576 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,2 +1,2 @@ # crawl4ai/_version.py -__version__ = "0.4.21" +__version__ = "0.4.22"