perf(crawler): major performance improvements & raw HTML support
- Switch to lxml parser (~4x speedup) - Add raw HTML & local file crawling support - Fix cache headers & async cleanup - Add browser process monitoring - Optimize BeautifulSoup operations - Pre-compile regex patterns Breaking: Raw HTML handling requires new URL prefixes Fixes: #256, #253
This commit is contained in:
33
CHANGELOG.md
33
CHANGELOG.md
@@ -1,5 +1,30 @@
|
||||
# Changelog
|
||||
|
||||
# Changelog - November 13, 2024
|
||||
|
||||
### Added
|
||||
- Support for raw HTML and local file crawling via URL prefixes ('raw:', 'file://')
|
||||
- Browser process monitoring for managed browser instances
|
||||
- Screenshot capability for raw HTML and local file content
|
||||
- Response headers storage in cache database
|
||||
- New `fit_markdown` flag for optional markdown generation
|
||||
|
||||
### Changed
|
||||
- Switched HTML parser from 'html.parser' to 'lxml' for ~4x performance improvement
|
||||
- Optimized BeautifulSoup text conversion and element selection
|
||||
- Pre-compiled regular expressions for better performance
|
||||
- Improved metadata extraction efficiency
|
||||
- Response headers now stored alongside HTML in cache
|
||||
|
||||
### Removed
|
||||
- `__del__` method from AsyncPlaywrightCrawlerStrategy to prevent async cleanup issues
|
||||
|
||||
### Fixed
|
||||
- Issue #256: Added support for crawling raw HTML content
|
||||
- Issue #253: Implemented file:// protocol handling
|
||||
- Missing response headers in cached results
|
||||
- Memory leaks from improper async cleanup
|
||||
|
||||
## [v0.3.731] - 2024-11-13 Changelog for Issue 256 Fix
|
||||
- Fixed: Browser context unexpectedly closing in Docker environment during crawl operations.
|
||||
- Removed: `__del__` method from AsyncPlaywrightCrawlerStrategy to prevent unreliable asynchronous cleanup, ensuring the browser context is closed explicitly within context managers.
|
||||
@@ -185,7 +210,7 @@ This commit introduces several key enhancements, including improved error handli
|
||||
## [v0.3.72] - 2024-10-20
|
||||
|
||||
### Fixed
|
||||
- Added support for parsing Base64 encoded images in WebScrappingStrategy
|
||||
- Added support for parsing Base64 encoded images in WebScrapingStrategy
|
||||
|
||||
### Added
|
||||
- Forked and integrated a customized version of the html2text library for more control over Markdown generation
|
||||
@@ -208,7 +233,7 @@ This commit introduces several key enhancements, including improved error handli
|
||||
### Developer Notes
|
||||
- The customized html2text library is now located within the crawl4ai package
|
||||
- New configuration options are available in the `config.py` file for external content handling
|
||||
- The `WebScrappingStrategy` class has been updated to accommodate new external content exclusion options
|
||||
- The `WebScrapingStrategy` class has been updated to accommodate new external content exclusion options
|
||||
|
||||
## [v0.3.71] - 2024-10-19
|
||||
|
||||
@@ -285,7 +310,7 @@ These updates aim to provide more flexibility in text processing, improve perfor
|
||||
|
||||
### Improvements
|
||||
1. **Better Error Handling**:
|
||||
- Enhanced error reporting in WebScrappingStrategy with detailed error messages and suggestions.
|
||||
- Enhanced error reporting in WebScrapingStrategy with detailed error messages and suggestions.
|
||||
- Added console message and error logging for better debugging.
|
||||
|
||||
2. **Image Processing Enhancements**:
|
||||
@@ -350,7 +375,7 @@ These updates aim to provide more flexibility in text processing, improve perfor
|
||||
- Allows for more customized setups.
|
||||
|
||||
### 2. Image Processing Optimization
|
||||
- Enhanced image handling in WebScrappingStrategy.
|
||||
- Enhanced image handling in WebScrapingStrategy.
|
||||
- Added filtering for small, invisible, or irrelevant images.
|
||||
- Improved image scoring system for better content relevance.
|
||||
- Implemented JavaScript-based image dimension updating for more accurate representation.
|
||||
|
||||
@@ -84,7 +84,7 @@ class ManagedBrowser:
|
||||
print(f"STDOUT: {stdout.decode()}")
|
||||
print(f"STDERR: {stderr.decode()}")
|
||||
await self.cleanup()
|
||||
|
||||
|
||||
def _get_browser_path(self) -> str:
|
||||
"""Returns the browser executable path based on OS and browser type"""
|
||||
if sys.platform == "darwin": # macOS
|
||||
@@ -493,6 +493,75 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
return page
|
||||
|
||||
async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
    """
    Crawl a web URL, or wrap local-file / raw HTML content in a response.

    Dispatch is based purely on the prefix of ``url``.

    Args:
        url (str): The source to process. Supported prefixes:
            - 'http://' or 'https://': Web URL, delegated to ``_crawl_web``.
            - 'file://': Local file path; the file is read as UTF-8 text.
            - 'raw:': Raw HTML; everything after the prefix is the document.
        **kwargs: Additional parameters:
            - 'screenshot' (bool): Whether to render the local/raw HTML and
              attach a screenshot. Defaults to False.
            - All keyword arguments are forwarded unchanged to
              ``_crawl_web`` for web URLs.

    Returns:
        AsyncCrawlResponse: For local/raw input, a response with the HTML,
        empty response headers, status code 200 and the optional screenshot.
        For web URLs, whatever ``_crawl_web`` returns.

    Raises:
        FileNotFoundError: If a 'file://' path does not exist.
        ValueError: If ``url`` does not start with a supported prefix.
    """
    if url.startswith(('http://', 'https://')):
        # Proceed with standard web crawling.
        return await self._crawl_web(url, **kwargs)

    if url.startswith('file://'):
        local_file_path = url[7:]  # Remove 'file://' prefix
        # EAFP: open directly instead of checking existence first, so the
        # file cannot vanish between the check and the read.
        try:
            with open(local_file_path, 'r', encoding='utf-8') as f:
                html = f.read()
        except FileNotFoundError:
            raise FileNotFoundError(f"Local file not found: {local_file_path}") from None
    elif url.startswith('raw:'):
        html = url[4:]  # Remove 'raw:' prefix
    else:
        raise ValueError("URL must start with 'http://', 'https://', 'file://', or 'raw:'")

    # Local and raw content share the same response shape: there is no HTTP
    # exchange, so headers are empty and the status defaults to 200.
    screenshot_data = None
    if kwargs.get('screenshot', False):
        screenshot_data = await self._generate_screenshot_from_html(html)

    return AsyncCrawlResponse(
        html=html,
        response_headers={},
        status_code=200,
        screenshot=screenshot_data,
        get_delayed_content=None
    )
|
||||
|
||||
|
||||
async def _crawl_web(self, url: str, **kwargs) -> AsyncCrawlResponse:
|
||||
"""
|
||||
Existing web crawling logic remains unchanged.
|
||||
|
||||
Args:
|
||||
url (str): The web URL to crawl.
|
||||
**kwargs: Additional parameters.
|
||||
|
||||
Returns:
|
||||
AsyncCrawlResponse: The response containing HTML, headers, status code, and optional screenshot.
|
||||
"""
|
||||
response_headers = {}
|
||||
status_code = None
|
||||
|
||||
@@ -792,7 +861,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
|
||||
if self.verbose:
|
||||
print(f"[LOG] ✅ Crawled {url} successfully!")
|
||||
|
||||
|
||||
if self.use_cached_html:
|
||||
cache_file_path = os.path.join(
|
||||
Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest()
|
||||
@@ -972,6 +1041,15 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
print(f"Warning: Failed to remove overlay elements: {str(e)}")
|
||||
|
||||
async def take_screenshot(self, page: Page) -> str:
|
||||
"""
|
||||
Takes a screenshot of the current page.
|
||||
|
||||
Args:
|
||||
page (Page): The Playwright page instance
|
||||
|
||||
Returns:
|
||||
str: Base64-encoded screenshot image
|
||||
"""
|
||||
try:
|
||||
# The page is already loaded, just take the screenshot
|
||||
screenshot = await page.screenshot(full_page=True)
|
||||
@@ -991,4 +1069,36 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
return base64.b64encode(buffered.getvalue()).decode('utf-8')
|
||||
finally:
|
||||
await page.close()
|
||||
|
||||
async def _generate_screenshot_from_html(self, html: str) -> Optional[str]:
|
||||
"""
|
||||
Generates a screenshot from raw HTML content.
|
||||
|
||||
Args:
|
||||
html (str): The HTML content to render and capture.
|
||||
|
||||
Returns:
|
||||
Optional[str]: Base64-encoded screenshot image or an error image if failed.
|
||||
"""
|
||||
try:
|
||||
if not self.browser:
|
||||
await self.start()
|
||||
page = await self.browser.new_page()
|
||||
await page.set_content(html, wait_until='networkidle')
|
||||
screenshot = await page.screenshot(full_page=True)
|
||||
await page.close()
|
||||
return base64.b64encode(screenshot).decode('utf-8')
|
||||
except Exception as e:
|
||||
error_message = f"Failed to take screenshot: {str(e)}"
|
||||
print(error_message)
|
||||
|
||||
# Generate an error image
|
||||
img = Image.new('RGB', (800, 600), color='black')
|
||||
draw = ImageDraw.Draw(img)
|
||||
font = ImageFont.load_default()
|
||||
draw.text((10, 10), error_message, fill=(255, 255, 255), font=font)
|
||||
|
||||
buffered = BytesIO()
|
||||
img.save(buffered, format="JPEG")
|
||||
return base64.b64encode(buffered.getvalue()).decode('utf-8')
|
||||
|
||||
|
||||
@@ -5,6 +5,7 @@ import asyncio
|
||||
from typing import Optional, Tuple, Dict
|
||||
from contextlib import asynccontextmanager
|
||||
import logging
|
||||
import json # Added for serialization/deserialization
|
||||
|
||||
# Set up logging
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
@@ -89,7 +90,8 @@ class AsyncDatabaseManager:
|
||||
media TEXT DEFAULT "{}",
|
||||
links TEXT DEFAULT "{}",
|
||||
metadata TEXT DEFAULT "{}",
|
||||
screenshot TEXT DEFAULT ""
|
||||
screenshot TEXT DEFAULT "",
|
||||
response_headers TEXT DEFAULT "{}" -- New column added
|
||||
)
|
||||
''')
|
||||
|
||||
@@ -105,26 +107,51 @@ class AsyncDatabaseManager:
|
||||
|
||||
column_names = await self.execute_with_retry(_check_columns)
|
||||
|
||||
for column in ['media', 'links', 'metadata', 'screenshot']:
|
||||
# List of new columns to add
|
||||
new_columns = ['media', 'links', 'metadata', 'screenshot', 'response_headers']
|
||||
|
||||
for column in new_columns:
|
||||
if column not in column_names:
|
||||
await self.aalter_db_add_column(column)
|
||||
|
||||
async def aalter_db_add_column(self, new_column: str):
|
||||
"""Add new column to the database"""
|
||||
async def _alter(db):
|
||||
await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""')
|
||||
if new_column == 'response_headers':
|
||||
await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT "{{}}"')
|
||||
else:
|
||||
await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""')
|
||||
logger.info(f"Added column '{new_column}' to the database.")
|
||||
|
||||
await self.execute_with_retry(_alter)
|
||||
|
||||
async def aget_cached_url(self, url: str) -> Optional[Tuple[str, str, str, str, str, str, str, bool, str]]:
|
||||
async def aget_cached_url(self, url: str) -> Optional[Tuple[str, str, str, str, str, bool, str, str, str, str]]:
|
||||
"""Retrieve cached URL data"""
|
||||
async def _get(db):
|
||||
async with db.execute(
|
||||
'SELECT url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot FROM crawled_data WHERE url = ?',
|
||||
'''
|
||||
SELECT url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot, response_headers
|
||||
FROM crawled_data WHERE url = ?
|
||||
''',
|
||||
(url,)
|
||||
) as cursor:
|
||||
return await cursor.fetchone()
|
||||
row = await cursor.fetchone()
|
||||
if row:
|
||||
# Deserialize JSON fields
|
||||
return (
|
||||
row[0], # url
|
||||
row[1], # html
|
||||
row[2], # cleaned_html
|
||||
row[3], # markdown
|
||||
row[4], # extracted_content
|
||||
row[5], # success
|
||||
json.loads(row[6] or '{}'), # media
|
||||
json.loads(row[7] or '{}'), # links
|
||||
json.loads(row[8] or '{}'), # metadata
|
||||
row[9], # screenshot
|
||||
json.loads(row[10] or '{}') # response_headers
|
||||
)
|
||||
return None
|
||||
|
||||
try:
|
||||
return await self.execute_with_retry(_get)
|
||||
@@ -132,12 +159,27 @@ class AsyncDatabaseManager:
|
||||
logger.error(f"Error retrieving cached URL: {e}")
|
||||
return None
|
||||
|
||||
async def acache_url(self, url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool, media: str = "{}", links: str = "{}", metadata: str = "{}", screenshot: str = ""):
|
||||
async def acache_url(
|
||||
self,
|
||||
url: str,
|
||||
html: str,
|
||||
cleaned_html: str,
|
||||
markdown: str,
|
||||
extracted_content: str,
|
||||
success: bool,
|
||||
media: str = "{}",
|
||||
links: str = "{}",
|
||||
metadata: str = "{}",
|
||||
screenshot: str = "",
|
||||
response_headers: str = "{}" # New parameter added
|
||||
):
|
||||
"""Cache URL data with retry logic"""
|
||||
async def _cache(db):
|
||||
await db.execute('''
|
||||
INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
INSERT INTO crawled_data (
|
||||
url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot, response_headers
|
||||
)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
ON CONFLICT(url) DO UPDATE SET
|
||||
html = excluded.html,
|
||||
cleaned_html = excluded.cleaned_html,
|
||||
@@ -147,8 +189,9 @@ class AsyncDatabaseManager:
|
||||
media = excluded.media,
|
||||
links = excluded.links,
|
||||
metadata = excluded.metadata,
|
||||
screenshot = excluded.screenshot
|
||||
''', (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot))
|
||||
screenshot = excluded.screenshot,
|
||||
response_headers = excluded.response_headers -- Update response_headers
|
||||
''', (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot, response_headers))
|
||||
|
||||
try:
|
||||
await self.execute_with_retry(_cache)
|
||||
@@ -189,4 +232,4 @@ class AsyncDatabaseManager:
|
||||
logger.error(f"Error flushing database: {e}")
|
||||
|
||||
# Create a singleton instance
|
||||
async_db_manager = AsyncDatabaseManager()
|
||||
async_db_manager = AsyncDatabaseManager()
|
||||
|
||||
@@ -9,7 +9,7 @@ from .async_database import async_db_manager
|
||||
from .chunking_strategy import *
|
||||
from .extraction_strategy import *
|
||||
from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse
|
||||
from .content_scrapping_strategy import WebScrappingStrategy
|
||||
from .content_scrapping_strategy import WebScrapingStrategy
|
||||
from .config import MIN_WORD_THRESHOLD, IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD
|
||||
from .utils import (
|
||||
sanitize_input_encode,
|
||||
@@ -47,17 +47,17 @@ class AsyncWebCrawler:
|
||||
|
||||
async def awarmup(self):
|
||||
# Print a message for crawl4ai and its version
|
||||
print(f"[LOG] 🚀 Crawl4AI {crawl4ai_version}")
|
||||
if self.verbose:
|
||||
print(f"[LOG] 🚀 Crawl4AI {crawl4ai_version}")
|
||||
print("[LOG] 🌤️ Warming up the AsyncWebCrawler")
|
||||
# await async_db_manager.ainit_db()
|
||||
await async_db_manager.initialize()
|
||||
# await self.arun(
|
||||
# url="https://google.com/",
|
||||
# word_count_threshold=5,
|
||||
# bypass_cache=False,
|
||||
# verbose=False,
|
||||
# )
|
||||
await self.arun(
|
||||
url="https://google.com/",
|
||||
word_count_threshold=5,
|
||||
bypass_cache=False,
|
||||
verbose=False,
|
||||
)
|
||||
self.ready = True
|
||||
if self.verbose:
|
||||
print("[LOG] 🌞 AsyncWebCrawler is ready to crawl")
|
||||
@@ -75,6 +75,19 @@ class AsyncWebCrawler:
|
||||
verbose=True,
|
||||
**kwargs,
|
||||
) -> CrawlResult:
|
||||
"""
|
||||
Runs the crawler for a single source: URL (web, local file, or raw HTML).
|
||||
|
||||
Args:
|
||||
url (str): The URL to crawl. Supported prefixes:
|
||||
- 'http://' or 'https://': Web URL to crawl.
|
||||
- 'file://': Local file path to process.
|
||||
- 'raw:': Raw HTML content to process.
|
||||
... [other existing parameters]
|
||||
|
||||
Returns:
|
||||
CrawlResult: The result of the crawling and processing.
|
||||
"""
|
||||
try:
|
||||
extraction_strategy = extraction_strategy or NoExtractionStrategy()
|
||||
extraction_strategy.verbose = verbose
|
||||
@@ -89,8 +102,13 @@ class AsyncWebCrawler:
|
||||
cached = None
|
||||
screenshot_data = None
|
||||
extracted_content = None
|
||||
if not bypass_cache and not self.always_by_pass_cache:
|
||||
|
||||
is_web_url = url.startswith(('http://', 'https://'))
|
||||
if is_web_url and not bypass_cache and not self.always_by_pass_cache:
|
||||
cached = await async_db_manager.aget_cached_url(url)
|
||||
|
||||
# if not bypass_cache and not self.always_by_pass_cache:
|
||||
# cached = await async_db_manager.aget_cached_url(url)
|
||||
|
||||
if kwargs.get("warmup", True) and not self.ready:
|
||||
return None
|
||||
@@ -117,25 +135,32 @@ class AsyncWebCrawler:
|
||||
)
|
||||
|
||||
crawl_result = await self.aprocess_html(
|
||||
url,
|
||||
html,
|
||||
extracted_content,
|
||||
word_count_threshold,
|
||||
extraction_strategy,
|
||||
chunking_strategy,
|
||||
css_selector,
|
||||
screenshot_data,
|
||||
verbose,
|
||||
bool(cached),
|
||||
url=url,
|
||||
html=html,
|
||||
extracted_content=extracted_content,
|
||||
word_count_threshold=word_count_threshold,
|
||||
extraction_strategy=extraction_strategy,
|
||||
chunking_strategy=chunking_strategy,
|
||||
css_selector=css_selector,
|
||||
screenshot=screenshot_data,
|
||||
verbose=verbose,
|
||||
is_cached=bool(cached),
|
||||
async_response=async_response,
|
||||
bypass_cache=bypass_cache,
|
||||
**kwargs,
|
||||
)
|
||||
crawl_result.status_code = async_response.status_code if async_response else 200
|
||||
crawl_result.response_headers = async_response.response_headers if async_response else {}
|
||||
|
||||
if async_response:
|
||||
crawl_result.status_code = async_response.status_code
|
||||
crawl_result.response_headers = async_response.response_headers
|
||||
else:
|
||||
crawl_result.status_code = 200
|
||||
crawl_result.response_headers = cached[10]
|
||||
|
||||
crawl_result.success = bool(html)
|
||||
crawl_result.session_id = kwargs.get("session_id", None)
|
||||
return crawl_result
|
||||
|
||||
except Exception as e:
|
||||
if not hasattr(e, "msg"):
|
||||
e.msg = str(e)
|
||||
@@ -155,22 +180,40 @@ class AsyncWebCrawler:
|
||||
verbose=True,
|
||||
**kwargs,
|
||||
) -> List[CrawlResult]:
|
||||
tasks = [
|
||||
self.arun(
|
||||
url,
|
||||
word_count_threshold,
|
||||
extraction_strategy,
|
||||
chunking_strategy,
|
||||
bypass_cache,
|
||||
css_selector,
|
||||
screenshot,
|
||||
user_agent,
|
||||
verbose,
|
||||
**kwargs
|
||||
)
|
||||
for url in urls
|
||||
]
|
||||
return await asyncio.gather(*tasks)
|
||||
"""
|
||||
Runs the crawler for multiple sources: URLs (web, local files, or raw HTML).
|
||||
|
||||
Args:
|
||||
urls (List[str]): A list of URLs with supported prefixes:
|
||||
- 'http://' or 'https://': Web URL to crawl.
|
||||
- 'file://': Local file path to process.
|
||||
- 'raw:': Raw HTML content to process.
|
||||
... [other existing parameters]
|
||||
|
||||
Returns:
|
||||
List[CrawlResult]: The results of the crawling and processing.
|
||||
"""
|
||||
semaphore_count = kwargs.get('semaphore_count', 5) # Adjust as needed
|
||||
semaphore = asyncio.Semaphore(semaphore_count)
|
||||
|
||||
async def crawl_with_semaphore(url):
|
||||
async with semaphore:
|
||||
return await self.arun(
|
||||
url,
|
||||
word_count_threshold=word_count_threshold,
|
||||
extraction_strategy=extraction_strategy,
|
||||
chunking_strategy=chunking_strategy,
|
||||
bypass_cache=bypass_cache,
|
||||
css_selector=css_selector,
|
||||
screenshot=screenshot,
|
||||
user_agent=user_agent,
|
||||
verbose=verbose,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
tasks = [crawl_with_semaphore(url) for url in urls]
|
||||
results = await asyncio.gather(*tasks, return_exceptions=True)
|
||||
return [result if not isinstance(result, Exception) else str(result) for result in results]
|
||||
|
||||
async def aprocess_html(
|
||||
self,
|
||||
@@ -184,13 +227,14 @@ class AsyncWebCrawler:
|
||||
screenshot: str,
|
||||
verbose: bool,
|
||||
is_cached: bool,
|
||||
async_response: Optional[AsyncCrawlResponse],
|
||||
**kwargs,
|
||||
) -> CrawlResult:
|
||||
t = time.time()
|
||||
# Extract content from HTML
|
||||
try:
|
||||
t1 = time.time()
|
||||
scrapping_strategy = WebScrappingStrategy()
|
||||
scrapping_strategy = WebScrapingStrategy()
|
||||
# result = await scrapping_strategy.ascrap(
|
||||
result = scrapping_strategy.scrap(
|
||||
url,
|
||||
@@ -245,6 +289,12 @@ class AsyncWebCrawler:
|
||||
)
|
||||
|
||||
screenshot = None if not screenshot else screenshot
|
||||
|
||||
response_headers = "{}" # Default value
|
||||
if async_response:
|
||||
# Serialize response_headers dict to JSON string
|
||||
response_headers = json.dumps(async_response.response_headers, ensure_ascii=False)
|
||||
|
||||
|
||||
if not is_cached or kwargs.get("bypass_cache", False) or self.always_by_pass_cache:
|
||||
await async_db_manager.acache_url(
|
||||
@@ -258,6 +308,7 @@ class AsyncWebCrawler:
|
||||
json.dumps(links),
|
||||
json.dumps(metadata),
|
||||
screenshot=screenshot,
|
||||
response_headers=response_headers,
|
||||
)
|
||||
|
||||
return CrawlResult(
|
||||
|
||||
@@ -15,7 +15,7 @@ class ContentCleaningStrategy:
|
||||
self.link_density_threshold = 0.2
|
||||
self.max_dom_depth = 10 # To prevent excessive DOM traversal
|
||||
|
||||
def clean(self, clean_html: str) -> str:
|
||||
def clean(self, clean_html: str, soup = None) -> str:
|
||||
"""
|
||||
Main function that takes cleaned HTML and returns super cleaned HTML.
|
||||
|
||||
@@ -28,18 +28,20 @@ class ContentCleaningStrategy:
|
||||
try:
|
||||
if not clean_html or not isinstance(clean_html, str):
|
||||
return ''
|
||||
soup = BeautifulSoup(clean_html, 'html.parser')
|
||||
if not soup:
|
||||
# soup = BeautifulSoup(clean_html, 'html.parser')
|
||||
soup = BeautifulSoup(clean_html, 'lxml')
|
||||
main_content = self.extract_main_content(soup)
|
||||
if main_content:
|
||||
super_clean_element = self.clean_element(main_content)
|
||||
return str(super_clean_element)
|
||||
return super_clean_element.encode_contents().decode('utf-8')
|
||||
else:
|
||||
return ''
|
||||
except Exception:
|
||||
# Handle exceptions silently or log them as needed
|
||||
return ''
|
||||
|
||||
def extract_main_content(self, soup: BeautifulSoup) -> Optional[Tag]:
|
||||
def extract_main_content(self, soup) -> Optional[Tag]:
|
||||
"""
|
||||
Identifies and extracts the main content element from the HTML.
|
||||
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
import re # Point 1: Pre-Compile Regular Expressions
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Dict, Any
|
||||
from bs4 import BeautifulSoup
|
||||
@@ -105,7 +106,39 @@ class CustomHTML2Text(HTML2Text):
|
||||
return
|
||||
super().handle_data(data, entity_char)
|
||||
|
||||
class ContentScrappingStrategy(ABC):
|
||||
# Pre-compile regular expressions for Open Graph and Twitter metadata
|
||||
OG_REGEX = re.compile(r'^og:')
|
||||
TWITTER_REGEX = re.compile(r'^twitter:')
|
||||
DIMENSION_REGEX = re.compile(r"(\d+)(\D*)")
|
||||
|
||||
# Function to parse image height/width value and units
|
||||
def parse_dimension(dimension):
    """Split an image dimension attribute into its numeric value and unit.

    Args:
        dimension: Raw height/width attribute value (e.g. ``"120px"``).
            Falsy values are treated as missing.

    Returns:
        A ``(number, unit)`` tuple, where ``number`` is an ``int`` and
        ``unit`` falls back to ``'px'`` when nothing follows the digits.
        Returns ``(None, None)`` when the value is missing or
        ``DIMENSION_REGEX`` does not match at its start.
    """
    if not dimension:
        return None, None

    parsed = DIMENSION_REGEX.match(dimension)
    if not parsed:
        return None, None

    digits, suffix = parsed.group(1), parsed.group(2)
    # Default unit is 'px' if not specified
    return int(digits), (suffix if suffix else 'px')
|
||||
|
||||
# Fetch image file metadata to extract size and extension
|
||||
def fetch_image_file_size(img, base_url):
    """Return the ``Content-Length`` the server reports for an image.

    Issues a HEAD request for the image URL, so no image data is downloaded.

    Args:
        img: Image element exposing ``.get('src')``.
        base_url: URL used to resolve a relative ``src``.

    Returns:
        The ``Content-Length`` header value as returned by the server, or
        ``None`` when the header is absent, the response status is not 200,
        or the request could not be made.
    """
    # If src is a relative path, construct the full URL; an absolute URL
    # (for example a CDN URL) is left unchanged by urljoin.
    img_url = urljoin(base_url, img.get('src'))
    try:
        # A timeout keeps one unresponsive image host from stalling the crawl.
        response = requests.head(img_url, timeout=10)
    except requests.RequestException:
        # Covers InvalidSchema (e.g. data: URIs) as well as network errors;
        # the size is optional, so a failed lookup is not fatal.
        return None

    # The original ended with `finally: return`, which overrode every return
    # above it and made the function always yield None.
    if response.status_code == 200:
        return response.headers.get('Content-Length', None)

    print(f"Failed to retrieve file size for {img_url}")
    return None
|
||||
|
||||
class ContentScrapingStrategy(ABC):
|
||||
@abstractmethod
|
||||
def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
|
||||
pass
|
||||
@@ -114,7 +147,7 @@ class ContentScrappingStrategy(ABC):
|
||||
async def ascrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
|
||||
pass
|
||||
|
||||
class WebScrappingStrategy(ContentScrappingStrategy):
|
||||
class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
|
||||
return self._get_content_of_website_optimized(url, html, is_async=False, **kwargs)
|
||||
|
||||
@@ -126,9 +159,16 @@ class WebScrappingStrategy(ContentScrappingStrategy):
|
||||
if not html:
|
||||
return None
|
||||
|
||||
soup = BeautifulSoup(html, 'html.parser')
|
||||
# soup = BeautifulSoup(html, 'html.parser')
|
||||
soup = BeautifulSoup(html, 'lxml')
|
||||
body = soup.body
|
||||
|
||||
try:
|
||||
meta = extract_metadata("", soup)
|
||||
except Exception as e:
|
||||
print('Error extracting metadata:', str(e))
|
||||
meta = {}
|
||||
|
||||
|
||||
image_description_min_word_threshold = kwargs.get('image_description_min_word_threshold', IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD)
|
||||
|
||||
@@ -187,31 +227,7 @@ class WebScrappingStrategy(ContentScrappingStrategy):
|
||||
|
||||
# Score an image for its usefulness
|
||||
def score_image_for_usefulness(img, base_url, index, images_count):
|
||||
# Function to parse image height/width value and units
|
||||
def parse_dimension(dimension):
|
||||
if dimension:
|
||||
match = re.match(r"(\d+)(\D*)", dimension)
|
||||
if match:
|
||||
number = int(match.group(1))
|
||||
unit = match.group(2) or 'px' # Default unit is 'px' if not specified
|
||||
return number, unit
|
||||
return None, None
|
||||
|
||||
# Fetch image file metadata to extract size and extension
|
||||
def fetch_image_file_size(img, base_url):
|
||||
#If src is relative path construct full URL, if not it may be CDN URL
|
||||
img_url = urljoin(base_url,img.get('src'))
|
||||
try:
|
||||
response = requests.head(img_url)
|
||||
if response.status_code == 200:
|
||||
return response.headers.get('Content-Length',None)
|
||||
else:
|
||||
print(f"Failed to retrieve file size for {img_url}")
|
||||
return None
|
||||
except InvalidSchema as e:
|
||||
return None
|
||||
finally:
|
||||
return
|
||||
|
||||
image_height = img.get('height')
|
||||
height_value, height_unit = parse_dimension(image_height)
|
||||
@@ -294,7 +310,6 @@ class WebScrappingStrategy(ContentScrappingStrategy):
|
||||
|
||||
exclude_social_media_domains = SOCIAL_MEDIA_DOMAINS + kwargs.get('exclude_social_media_domains', [])
|
||||
exclude_social_media_domains = list(set(exclude_social_media_domains))
|
||||
|
||||
|
||||
try:
|
||||
if element.name == 'a' and element.get('href'):
|
||||
@@ -439,15 +454,7 @@ class WebScrappingStrategy(ContentScrappingStrategy):
|
||||
except Exception as e:
|
||||
print('Error processing element:', str(e))
|
||||
return False
|
||||
|
||||
#process images by filtering and extracting contextual text from the page
|
||||
# imgs = body.find_all('img')
|
||||
# media['images'] = [
|
||||
# result for result in
|
||||
# (process_image(img, url, i, len(imgs)) for i, img in enumerate(imgs))
|
||||
# if result is not None
|
||||
# ]
|
||||
|
||||
|
||||
process_element(body)
|
||||
|
||||
# Update the links dictionary with unique links
|
||||
@@ -478,8 +485,9 @@ class WebScrappingStrategy(ContentScrappingStrategy):
|
||||
# Replace base64 data with empty string
|
||||
img['src'] = base64_pattern.sub('', src)
|
||||
|
||||
str_body = ""
|
||||
try:
|
||||
str(body)
|
||||
str_body = body.encode_contents().decode('utf-8')
|
||||
except Exception as e:
|
||||
# Reset body to the original HTML
|
||||
success = False
|
||||
@@ -504,11 +512,12 @@ class WebScrappingStrategy(ContentScrappingStrategy):
|
||||
|
||||
# Append the error div to the body
|
||||
body.body.append(error_div)
|
||||
str_body = body.encode_contents().decode('utf-8')
|
||||
|
||||
print(f"[LOG] 😧 Error: After processing the crawled HTML and removing irrelevant tags, nothing was left in the page. Check the markdown for further details.")
|
||||
|
||||
|
||||
cleaned_html = str(body).replace('\n\n', '\n').replace(' ', ' ')
|
||||
cleaned_html = str_body.replace('\n\n', '\n').replace(' ', ' ')
|
||||
|
||||
try:
|
||||
h = CustomHTML2Text()
|
||||
@@ -518,15 +527,14 @@ class WebScrappingStrategy(ContentScrappingStrategy):
|
||||
markdown = h.handle(sanitize_html(cleaned_html))
|
||||
markdown = markdown.replace(' ```', '```')
|
||||
|
||||
try:
|
||||
meta = extract_metadata(html, soup)
|
||||
except Exception as e:
|
||||
print('Error extracting metadata:', str(e))
|
||||
meta = {}
|
||||
|
||||
|
||||
cleaner = ContentCleaningStrategy()
|
||||
fit_html = cleaner.clean(cleaned_html)
|
||||
fit_markdown = h.handle(fit_html)
|
||||
fit_markdown = "Set flag 'fit_markdown' to True to get cleaned HTML content."
|
||||
fit_html = "Set flag 'fit_markdown' to True to get cleaned HTML content."
|
||||
if kwargs.get('fit_markdown', False):
|
||||
cleaner = ContentCleaningStrategy()
|
||||
fit_html = cleaner.clean(cleaned_html)
|
||||
fit_markdown = h.handle(fit_html)
|
||||
|
||||
cleaned_html = sanitize_html(cleaned_html)
|
||||
return {
|
||||
|
||||
@@ -736,46 +736,54 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
|
||||
'metadata': meta
|
||||
}
|
||||
|
||||
def extract_metadata(html, soup = None):
|
||||
def extract_metadata(html, soup=None):
|
||||
metadata = {}
|
||||
|
||||
if not html:
|
||||
if not html and not soup:
|
||||
return {}
|
||||
|
||||
if not soup:
|
||||
soup = BeautifulSoup(html, 'lxml')
|
||||
|
||||
head = soup.head
|
||||
if not head:
|
||||
return metadata
|
||||
|
||||
# Parse HTML content with BeautifulSoup
|
||||
if not soup:
|
||||
soup = BeautifulSoup(html, 'html.parser')
|
||||
|
||||
# Title
|
||||
title_tag = soup.find('title')
|
||||
metadata['title'] = title_tag.string if title_tag else None
|
||||
title_tag = head.find('title')
|
||||
metadata['title'] = title_tag.string.strip() if title_tag and title_tag.string else None
|
||||
|
||||
# Meta description
|
||||
description_tag = soup.find('meta', attrs={'name': 'description'})
|
||||
metadata['description'] = description_tag['content'] if description_tag else None
|
||||
description_tag = head.find('meta', attrs={'name': 'description'})
|
||||
metadata['description'] = description_tag.get('content', '').strip() if description_tag else None
|
||||
|
||||
# Meta keywords
|
||||
keywords_tag = soup.find('meta', attrs={'name': 'keywords'})
|
||||
metadata['keywords'] = keywords_tag['content'] if keywords_tag else None
|
||||
keywords_tag = head.find('meta', attrs={'name': 'keywords'})
|
||||
metadata['keywords'] = keywords_tag.get('content', '').strip() if keywords_tag else None
|
||||
|
||||
# Meta author
|
||||
author_tag = soup.find('meta', attrs={'name': 'author'})
|
||||
metadata['author'] = author_tag['content'] if author_tag else None
|
||||
author_tag = head.find('meta', attrs={'name': 'author'})
|
||||
metadata['author'] = author_tag.get('content', '').strip() if author_tag else None
|
||||
|
||||
# Open Graph metadata
|
||||
og_tags = soup.find_all('meta', attrs={'property': lambda value: value and value.startswith('og:')})
|
||||
og_tags = head.find_all('meta', attrs={'property': re.compile(r'^og:')})
|
||||
for tag in og_tags:
|
||||
property_name = tag['property']
|
||||
metadata[property_name] = tag['content']
|
||||
property_name = tag.get('property', '').strip()
|
||||
content = tag.get('content', '').strip()
|
||||
if property_name and content:
|
||||
metadata[property_name] = content
|
||||
|
||||
# Twitter Card metadata
|
||||
twitter_tags = soup.find_all('meta', attrs={'name': lambda value: value and value.startswith('twitter:')})
|
||||
twitter_tags = head.find_all('meta', attrs={'name': re.compile(r'^twitter:')})
|
||||
for tag in twitter_tags:
|
||||
property_name = tag['name']
|
||||
metadata[property_name] = tag['content']
|
||||
|
||||
property_name = tag.get('name', '').strip()
|
||||
content = tag.get('content', '').strip()
|
||||
if property_name and content:
|
||||
metadata[property_name] = content
|
||||
|
||||
return metadata
|
||||
|
||||
|
||||
def extract_xml_tags(string):
    """Return the distinct names of simple opening tags found in ``string``.

    Only tags of the exact form ``<name>`` are matched, where ``name`` is one
    or more word characters. Closing tags (``</name>``) and tags that carry
    attributes or whitespace are not matched by the pattern.

    Args:
        string: Text to scan.

    Returns:
        A list of unique tag names, in order of first appearance.
    """
    tags = re.findall(r'<(\w+)>', string)
    # dict.fromkeys de-duplicates while keeping first-seen order, so the result
    # is the same on every run (a set orders names by randomized string hashes).
    return list(dict.fromkeys(tags))
|
||||
|
||||
@@ -10,7 +10,7 @@ from .extraction_strategy import *
|
||||
from .crawler_strategy import *
|
||||
from typing import List
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from .content_scrapping_strategy import WebScrappingStrategy
|
||||
from .content_scrapping_strategy import WebScrapingStrategy
|
||||
from .config import *
|
||||
import warnings
|
||||
import json
|
||||
@@ -182,7 +182,7 @@ class WebCrawler:
|
||||
# Extract content from HTML
|
||||
try:
|
||||
t1 = time.time()
|
||||
scrapping_strategy = WebScrappingStrategy()
|
||||
scrapping_strategy = WebScrapingStrategy()
|
||||
extra_params = {k: v for k, v in kwargs.items() if k not in ["only_text", "image_description_min_word_threshold"]}
|
||||
result = scrapping_strategy.scrap(
|
||||
url,
|
||||
|
||||
235
docs/md_v2/basic/prefix-based-input.md
Normal file
235
docs/md_v2/basic/prefix-based-input.md
Normal file
@@ -0,0 +1,235 @@
|
||||
# Prefix-Based Input Handling in Crawl4AI
|
||||
|
||||
This guide will walk you through using the Crawl4AI library to crawl web pages, local HTML files, and raw HTML strings. We'll demonstrate these capabilities using a Wikipedia page as an example.
|
||||
|
||||
## Table of Contents
|
||||
- [Prefix-Based Input Handling in Crawl4AI](#prefix-based-input-handling-in-crawl4ai)
|
||||
- [Table of Contents](#table-of-contents)
|
||||
- [Crawling a Web URL](#crawling-a-web-url)
|
||||
- [Crawling a Local HTML File](#crawling-a-local-html-file)
|
||||
- [Crawling Raw HTML Content](#crawling-raw-html-content)
|
||||
- [Complete Example](#complete-example)
|
||||
- [**How It Works**](#how-it-works)
|
||||
- [**Running the Example**](#running-the-example)
|
||||
- [Conclusion](#conclusion)
|
||||
|
||||
---
|
||||
|
||||
|
||||
### Crawling a Web URL
|
||||
|
||||
To crawl a live web page, provide the URL starting with `http://` or `https://`.
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
|
||||
async def crawl_web():
    """Crawl a live web page and print its markdown, or the error on failure."""
    async with AsyncWebCrawler(verbose=True) as crawler:
        outcome = await crawler.arun(url="https://en.wikipedia.org/wiki/apple", bypass_cache=True)
        # Report the failure first so the success path reads straight through.
        if not outcome.success:
            print(f"Failed to crawl: {outcome.error_message}")
            return
        print("Markdown Content:")
        print(outcome.markdown)
|
||||
|
||||
asyncio.run(crawl_web())
|
||||
```
|
||||
|
||||
### Crawling a Local HTML File
|
||||
|
||||
To crawl a local HTML file, prefix the file path with `file://`.
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
|
||||
async def crawl_local_file():
    """Crawl a local HTML file through the ``file://`` prefix and print its markdown."""
    local_file_path = "/path/to/apple.html"  # Replace with your file path
    file_url = "file://" + local_file_path

    async with AsyncWebCrawler(verbose=True) as crawler:
        outcome = await crawler.arun(url=file_url, bypass_cache=True)
        # Report the failure first so the success path reads straight through.
        if not outcome.success:
            print(f"Failed to crawl local file: {outcome.error_message}")
            return
        print("Markdown Content from Local File:")
        print(outcome.markdown)
|
||||
|
||||
asyncio.run(crawl_local_file())
|
||||
```
|
||||
|
||||
### Crawling Raw HTML Content
|
||||
|
||||
To crawl raw HTML content, prefix the HTML string with `raw:`.
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
|
||||
async def crawl_raw_html():
    """Crawl an in-memory HTML string through the ``raw:`` prefix and print its markdown."""
    raw_html = "<html><body><h1>Hello, World!</h1></body></html>"
    raw_html_url = "raw:" + raw_html

    async with AsyncWebCrawler(verbose=True) as crawler:
        outcome = await crawler.arun(url=raw_html_url, bypass_cache=True)
        # Report the failure first so the success path reads straight through.
        if not outcome.success:
            print(f"Failed to crawl raw HTML: {outcome.error_message}")
            return
        print("Markdown Content from Raw HTML:")
        print(outcome.markdown)
|
||||
|
||||
asyncio.run(crawl_raw_html())
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Complete Example
|
||||
|
||||
Below is a comprehensive script that:
|
||||
1. **Crawls the Wikipedia page for "Apple".**
|
||||
2. **Saves the HTML content to a local file (`apple.html`).**
|
||||
3. **Crawls the local HTML file and verifies the markdown length matches the original crawl.**
|
||||
4. **Crawls the raw HTML content from the saved file and verifies consistency.**
|
||||
|
||||
```python
|
||||
import os
|
||||
import sys
|
||||
import asyncio
|
||||
from pathlib import Path
|
||||
|
||||
# Adjust the parent directory to include the crawl4ai module
|
||||
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
sys.path.append(parent_dir)
|
||||
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
|
||||
async def main():
    """Crawl one page three ways and check that the markdown lengths agree.

    The page is fetched from the web and saved as ``apple.html`` next to this
    script, then crawled again through the ``file://`` prefix and once more
    through the ``raw:`` prefix. The saved file is removed on every exit path,
    including early returns and failed assertions.

    Raises:
        AssertionError: If the local-file or raw-HTML markdown length differs
            from the web crawl's.
    """
    # Define the URL to crawl
    wikipedia_url = "https://en.wikipedia.org/wiki/apple"

    # Define the path to save the HTML file
    # Save the file in the same directory as the script
    script_dir = Path(__file__).parent
    html_file_path = script_dir / "apple.html"

    try:
        async with AsyncWebCrawler(verbose=True) as crawler:
            print("\n=== Step 1: Crawling the Wikipedia URL ===")
            # Crawl the Wikipedia URL
            result = await crawler.arun(url=wikipedia_url, bypass_cache=True)

            # Check if crawling was successful
            if not result.success:
                print(f"Failed to crawl {wikipedia_url}: {result.error_message}")
                return

            # Save the HTML content to a local file
            with open(html_file_path, 'w', encoding='utf-8') as f:
                f.write(result.html)
            print(f"Saved HTML content to {html_file_path}")

            # Store the length of the generated markdown
            web_crawl_length = len(result.markdown)
            print(f"Length of markdown from web crawl: {web_crawl_length}\n")

            print("=== Step 2: Crawling from the Local HTML File ===")
            # Construct the file URL with 'file://' prefix
            file_url = f"file://{html_file_path.resolve()}"

            # Crawl the local HTML file
            local_result = await crawler.arun(url=file_url, bypass_cache=True)

            # Check if crawling was successful
            if not local_result.success:
                print(f"Failed to crawl local file {file_url}: {local_result.error_message}")
                return

            # Store the length of the generated markdown from local file
            local_crawl_length = len(local_result.markdown)
            print(f"Length of markdown from local file crawl: {local_crawl_length}")

            # Compare the lengths
            assert web_crawl_length == local_crawl_length, (
                f"Markdown length mismatch: Web crawl ({web_crawl_length}) != Local file crawl ({local_crawl_length})"
            )
            print("✅ Markdown length matches between web crawl and local file crawl.\n")

            print("=== Step 3: Crawling Using Raw HTML Content ===")
            # Read the HTML content from the saved file
            with open(html_file_path, 'r', encoding='utf-8') as f:
                raw_html_content = f.read()

            # Prefix the raw HTML content with 'raw:'
            raw_html_url = f"raw:{raw_html_content}"

            # Crawl using the raw HTML content
            raw_result = await crawler.arun(url=raw_html_url, bypass_cache=True)

            # Check if crawling was successful
            if not raw_result.success:
                print(f"Failed to crawl raw HTML content: {raw_result.error_message}")
                return

            # Store the length of the generated markdown from raw HTML
            raw_crawl_length = len(raw_result.markdown)
            print(f"Length of markdown from raw HTML crawl: {raw_crawl_length}")

            # Compare the lengths
            assert web_crawl_length == raw_crawl_length, (
                f"Markdown length mismatch: Web crawl ({web_crawl_length}) != Raw HTML crawl ({raw_crawl_length})"
            )
            print("✅ Markdown length matches between web crawl and raw HTML crawl.\n")

            print("All tests passed successfully!")
    finally:
        # Clean up by removing the saved HTML file. Doing this in `finally`
        # means a failed crawl or a failed assertion no longer leaves
        # apple.html behind; the existence check covers a failure in step 1,
        # before the file has been written.
        if html_file_path.exists():
            os.remove(html_file_path)
            print(f"Removed the saved HTML file: {html_file_path}")
|
||||
|
||||
# Run the main function
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
### **How It Works**
|
||||
|
||||
1. **Step 1: Crawl the Web URL**
|
||||
- Crawls `https://en.wikipedia.org/wiki/apple`.
|
||||
- Saves the HTML content to `apple.html`.
|
||||
- Records the length of the generated markdown.
|
||||
|
||||
2. **Step 2: Crawl from the Local HTML File**
|
||||
- Uses the `file://` prefix to crawl `apple.html`.
|
||||
- Ensures the markdown length matches the original web crawl.
|
||||
|
||||
3. **Step 3: Crawl Using Raw HTML Content**
|
||||
- Reads the HTML from `apple.html`.
|
||||
- Prefixes it with `raw:` and crawls.
|
||||
- Verifies the markdown length matches the previous results.
|
||||
|
||||
4. **Cleanup**
|
||||
- Deletes the `apple.html` file after testing.
|
||||
|
||||
### **Running the Example**
|
||||
|
||||
1. **Save the Script:**
|
||||
- Save the above code as `test_crawl4ai.py` in your project directory.
|
||||
|
||||
2. **Execute the Script:**
|
||||
- Run the script using:
|
||||
```bash
|
||||
python test_crawl4ai.py
|
||||
```
|
||||
|
||||
3. **Observe the Output:**
|
||||
- The script will print logs detailing each step.
|
||||
- Assertions ensure consistency across different crawling methods.
|
||||
- Upon success, it confirms that all markdown lengths match.
|
||||
|
||||
---
|
||||
|
||||
## Conclusion
|
||||
|
||||
With the new prefix-based input handling in **Crawl4AI**, you can effortlessly crawl web URLs, local HTML files, and raw HTML strings using a unified `url` parameter. This enhancement simplifies the API usage and provides greater flexibility for diverse crawling scenarios.
|
||||
|
||||
2179
tests/async/sample_wikipedia.html
Normal file
2179
tests/async/sample_wikipedia.html
Normal file
File diff suppressed because one or more lines are too long
162
tests/async/test_content_scraper_strategy.py
Normal file
162
tests/async/test_content_scraper_strategy.py
Normal file
@@ -0,0 +1,162 @@
|
||||
import asyncio
|
||||
from bs4 import BeautifulSoup
|
||||
from typing import Dict, Any
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import csv
|
||||
from tabulate import tabulate
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Dict
|
||||
|
||||
parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
sys.path.append(parent_dir)
|
||||
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
||||
|
||||
from crawl4ai.content_scrapping_strategy import WebScrapingStrategy
|
||||
from crawl4ai.content_scrapping_strategy import WebScrapingStrategy as WebScrapingStrategyCurrent
|
||||
# from crawl4ai.content_scrapping_strategy_current import WebScrapingStrategy as WebScrapingStrategyCurrent
|
||||
|
||||
@dataclass
class TestResult:
    """Summary of one scraper run for a single test case."""

    # The class name starts with "Test" and this file is named test_*.py, so
    # pytest would try to collect it as a test class and warn about its
    # generated __init__. Un-annotated, this is a plain class attribute and
    # not a dataclass field, so the constructor signature is unchanged.
    __test__ = False

    name: str  # label of the test case
    success: bool  # the 'success' value reported by the scraper
    images: int  # number of images found
    internal_links: int  # number of internal links found
    external_links: int  # number of external links found
    markdown_length: int  # length of the generated markdown
    execution_time: float  # elapsed time of the scraper call, in seconds
|
||||
|
||||
class StrategyTester:
    """Run the same scraping scenarios through two scrapers and compare the results.

    Each scenario is executed against the bundled ``sample_wikipedia.html``
    fixture with both a "new" and a "current" scraper. The per-scenario
    counts and timings are written to a CSV file and printed as a table.

    NOTE: in this file both scraper names are imported from the same class, so
    the comparison is a self-comparison until a different baseline is imported.
    """

    def __init__(self):
        """Create both scrapers and load the HTML fixture into memory."""
        self.new_scraper = WebScrapingStrategy()
        self.current_scraper = WebScrapingStrategyCurrent()
        # Build the path with os.path.join, as save_results_to_csv does,
        # instead of concatenating a hard-coded '/' separator.
        fixture_path = os.path.join(__location__, 'sample_wikipedia.html')
        with open(fixture_path, 'r', encoding='utf-8') as f:
            self.WIKI_HTML = f.read()
        self.results = {'new': [], 'current': []}

    def run_test(self, name: str, **kwargs) -> tuple[TestResult, TestResult]:
        """Run one scenario through both scrapers.

        Args:
            name: Label stored on the returned results.
            **kwargs: Options forwarded unchanged to the scraper call.

        Returns:
            A ``(new, current)`` pair of results.
        """
        results = []
        for scraper in [self.new_scraper, self.current_scraper]:
            # perf_counter is monotonic and high-resolution, so the measured
            # interval cannot be skewed by system clock adjustments.
            start_time = time.perf_counter()
            result = scraper._get_content_of_website_optimized(
                url="https://en.wikipedia.org/wiki/Test",
                html=self.WIKI_HTML,
                **kwargs
            )
            execution_time = time.perf_counter() - start_time

            test_result = TestResult(
                name=name,
                success=result['success'],
                images=len(result['media']['images']),
                internal_links=len(result['links']['internal']),
                external_links=len(result['links']['external']),
                markdown_length=len(result['markdown']),
                execution_time=execution_time
            )
            results.append(test_result)

        return results[0], results[1]  # new, current

    def run_all_tests(self):
        """Run every scenario, then save the results to CSV and print the table.

        A scenario that raises is reported on stdout and left out of the
        results; the remaining scenarios still run.
        """
        test_cases = [
            ("Basic Extraction", {}),
            ("Exclude Tags", {'excluded_tags': ['table', 'div.infobox', 'div.navbox']}),
            ("Word Threshold", {'word_count_threshold': 50}),
            ("CSS Selector", {'css_selector': 'div.mw-parser-output > p'}),
            ("Link Exclusions", {
                'exclude_external_links': True,
                'exclude_social_media_links': True,
                'exclude_domains': ['facebook.com', 'twitter.com']
            }),
            ("Media Handling", {
                'exclude_external_images': True,
                'image_description_min_word_threshold': 20
            }),
            ("Text Only", {
                'only_text': True,
                'remove_forms': True
            }),
            ("HTML Cleaning", {
                'clean_html': True,
                'keep_data_attributes': True
            }),
            ("HTML2Text Options", {
                'html2text': {
                    'skip_internal_links': True,
                    'single_line_break': True,
                    'mark_code': True,
                    'preserve_tags': ['pre', 'code']
                }
            })
        ]

        all_results = []
        for name, kwargs in test_cases:
            # Deliberately best-effort: one failing scenario must not stop
            # the comparison of the others.
            try:
                new_result, current_result = self.run_test(name, **kwargs)
                all_results.append((name, new_result, current_result))
            except Exception as e:
                print(f"Error in {name}: {str(e)}")

        self.save_results_to_csv(all_results)
        self.print_comparison_table(all_results)

    @staticmethod
    def _result_row(label: str, strategy: str, result: TestResult) -> list:
        """Build one output row (shared by the CSV and the printed table)."""
        return [
            label, strategy, result.success, result.images,
            result.internal_links, result.external_links,
            result.markdown_length, f"{result.execution_time:.3f}"
        ]

    def save_results_to_csv(self, all_results: List[tuple]):
        """Write two rows per scenario to ``strategy_comparison_results.csv``.

        The file is created in the module-level ``__location__`` directory.

        Args:
            all_results: ``(name, new_result, current_result)`` tuples.
        """
        csv_file = os.path.join(__location__, 'strategy_comparison_results.csv')
        with open(csv_file, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(['Test Name', 'Strategy', 'Success', 'Images', 'Internal Links',
                             'External Links', 'Markdown Length', 'Execution Time'])

            for name, new_result, current_result in all_results:
                writer.writerow(self._result_row(name, 'New', new_result))
                writer.writerow(self._result_row(name, 'Current', current_result))

    def print_comparison_table(self, all_results: List[tuple]):
        """Print a grid table of the results, flagging fields that differ.

        Args:
            all_results: ``(name, new_result, current_result)`` tuples.
        """
        table_data = []
        headers = ['Test Name', 'Strategy', 'Success', 'Images', 'Internal Links',
                   'External Links', 'Markdown Length', 'Time (s)']

        # (attribute compared, label shown in the differences row)
        compared_fields = [
            ('images', 'images'),
            ('internal_links', 'internal_links'),
            ('external_links', 'external_links'),
            ('markdown_length', 'markdown'),
        ]

        for name, new_result, current_result in all_results:
            # Check for differences
            differences = [
                label for attr, label in compared_fields
                if getattr(new_result, attr) != getattr(current_result, attr)
            ]

            # The scenario name is shown on the first row of each pair only.
            table_data.append(self._result_row(name, 'New', new_result))
            table_data.append(self._result_row('', 'Current', current_result))

            # Add difference summary if any
            if differences:
                table_data.append(['', '⚠️ Differences', ', '.join(differences), '', '', '', '', ''])

            # Add empty row for better readability
            table_data.append([''] * len(headers))

        print("\nStrategy Comparison Results:")
        print(tabulate(table_data, headers=headers, tablefmt='grid'))
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: build the tester and run every comparison scenario.
    StrategyTester().run_all_tests()
|
||||
Reference in New Issue
Block a user