Compare commits


10 Commits

| Author | SHA1 | Message | Date |
| --- | --- | --- | --- |
| UncleCode | da1bc0f7bf | Update version file | 2025-01-01 19:42:35 +08:00 |
| UncleCode | aa4f92f458 | refactor(crawler): update hello_world example with proper content filtering | 2025-01-01 19:39:42 +08:00 |
| UncleCode | a96e05d4ae | refactor(crawler): optimize response handling and default settings (set wait_for_images default to false for better performance; simplify response attribute copying in AsyncWebCrawler; update hello_world example with proper content filtering) | 2025-01-01 19:39:02 +08:00 |
| UncleCode | 4cb2a62551 | Update README | 2025-01-01 18:59:55 +08:00 |
| UncleCode | 5b4fad9e25 | Bump version to 0.4.244 | 2025-01-01 18:58:43 +08:00 |
| UncleCode | ea0ac25f38 | refactor(browser): update browser channel default to 'chromium' in BrowserConfig.from_args method | 2025-01-01 18:58:15 +08:00 |
| UncleCode | 7688aca7d6 | Update Version | 2025-01-01 18:44:27 +08:00 |
| UncleCode | a7215ad972 | fix(browser): update default browser channel to chromium and simplify channel selection logic | 2025-01-01 18:38:33 +08:00 |
| UncleCode | 318554e6bf | Merge branch 'v0.4.243' | 2025-01-01 18:11:15 +08:00 |
| UncleCode | 3e769a9c6c | Fix issue in 0.4.24 walkthrough | 2024-12-31 21:07:33 +08:00 |
6 changed files with 142 additions and 79 deletions

View File

@@ -11,7 +11,7 @@
 [![Python Version](https://img.shields.io/pypi/pyversions/crawl4ai)](https://pypi.org/project/crawl4ai/)
 [![Downloads](https://static.pepy.tech/badge/crawl4ai/month)](https://pepy.tech/project/crawl4ai)
-[![Documentation Status](https://readthedocs.org/projects/crawl4ai/badge/?version=latest)](https://crawl4ai.readthedocs.io/)
+<!-- [![Documentation Status](https://readthedocs.org/projects/crawl4ai/badge/?version=latest)](https://crawl4ai.readthedocs.io/) -->
 [![License](https://img.shields.io/github/license/unclecode/crawl4ai)](https://github.com/unclecode/crawl4ai/blob/main/LICENSE)
 [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
 [![Security: bandit](https://img.shields.io/badge/security-bandit-yellow.svg)](https://github.com/PyCQA/bandit)

View File

@@ -1,2 +1,2 @@
 # crawl4ai/_version.py
-__version__ = "0.4.243"
+__version__ = "0.4.246"

View File

@@ -35,7 +35,9 @@ class BrowserConfig:
         user_data_dir (str or None): Path to a user data directory for persistent sessions. If None, a
             temporary directory may be used. Default: None.
         chrome_channel (str): The Chrome channel to launch (e.g., "chrome", "msedge"). Only applies if browser_type
-            is "chromium". Default: "chrome".
+            is "chromium". Default: "chromium".
+        channel (str): The channel to launch (e.g., "chromium", "chrome", "msedge"). Only applies if browser_type
+            is "chromium". Default: "chromium".
         proxy (str or None): Proxy server URL (e.g., "http://username:password@proxy:port"). If None, no proxy is used.
             Default: None.
         proxy_config (dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}.
@@ -77,7 +79,8 @@ class BrowserConfig:
         use_managed_browser: bool = False,
         use_persistent_context: bool = False,
         user_data_dir: str = None,
-        chrome_channel: str = "chrome",
+        chrome_channel: str = "chromium",
+        channel: str = "chromium",
         proxy: str = None,
         proxy_config: dict = None,
         viewport_width: int = 1080,
@@ -107,14 +110,8 @@ class BrowserConfig:
         self.use_managed_browser = use_managed_browser
         self.use_persistent_context = use_persistent_context
         self.user_data_dir = user_data_dir
-        if self.browser_type == "chromium":
-            self.chrome_channel = "chrome"
-        elif self.browser_type == "firefox":
-            self.chrome_channel = "firefox"
-        elif self.browser_type == "webkit":
-            self.chrome_channel = "webkit"
-        else:
-            self.chrome_channel = chrome_channel or "chrome"
+        self.chrome_channel = chrome_channel or self.browser_type or "chromium"
+        self.channel = channel or self.browser_type or "chromium"
         self.proxy = proxy
         self.proxy_config = proxy_config
         self.viewport_width = viewport_width
@@ -161,7 +158,8 @@ class BrowserConfig:
         use_managed_browser=kwargs.get("use_managed_browser", False),
         use_persistent_context=kwargs.get("use_persistent_context", False),
         user_data_dir=kwargs.get("user_data_dir"),
-        chrome_channel=kwargs.get("chrome_channel", "chrome"),
+        chrome_channel=kwargs.get("chrome_channel", "chromium"),
+        channel=kwargs.get("channel", "chromium"),
         proxy=kwargs.get("proxy"),
         proxy_config=kwargs.get("proxy_config"),
         viewport_width=kwargs.get("viewport_width", 1080),
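
Net effect of the four hunks above: the old if/elif ladder that forced `chrome_channel` per browser type is gone, both `chrome_channel` and the new `channel` parameter default to "chromium", and the kwargs-based constructor forwards both. A minimal sketch of the resulting behavior, assuming the top-level `crawl4ai` exports (the `msedge` override is illustrative):

```python
from crawl4ai import BrowserConfig

# Defaults now resolve to "chromium" rather than "chrome".
cfg = BrowserConfig()
print(cfg.chrome_channel, cfg.channel)  # -> chromium chromium

# An explicit value still takes precedence, e.g. to launch branded Edge.
edge_cfg = BrowserConfig(channel="msedge")
print(edge_cfg.channel)  # -> msedge
```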
@@ -248,7 +246,7 @@ class CrawlerRunConfig:
         wait_for (str or None): A CSS selector or JS condition to wait for before extracting content.
             Default: None.
         wait_for_images (bool): If True, wait for images to load before extracting content.
-            Default: True.
+            Default: False.
         delay_before_return_html (float): Delay in seconds before retrieving final HTML.
             Default: 0.1.
         mean_delay (float): Mean base delay between requests when calling arun_many.
@@ -347,7 +345,7 @@ class CrawlerRunConfig:
         wait_until: str = "domcontentloaded",
         page_timeout: int = PAGE_TIMEOUT,
         wait_for: str = None,
-        wait_for_images: bool = True,
+        wait_for_images: bool = False,
         delay_before_return_html: float = 0.1,
         mean_delay: float = 0.1,
         max_range: float = 0.3,
@@ -505,7 +503,7 @@ class CrawlerRunConfig:
         wait_until=kwargs.get("wait_until", "domcontentloaded"),
         page_timeout=kwargs.get("page_timeout", 60000),
         wait_for=kwargs.get("wait_for"),
-        wait_for_images=kwargs.get("wait_for_images", True),
+        wait_for_images=kwargs.get("wait_for_images", False),
         delay_before_return_html=kwargs.get("delay_before_return_html", 0.1),
         mean_delay=kwargs.get("mean_delay", 0.1),
         max_range=kwargs.get("max_range", 0.3),
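
With `wait_for_images` flipped to `False` in all three places (docstring, constructor default, and kwargs fallback), image-dependent crawls must opt back in. A short sketch, assuming the standard top-level import:

```python
from crawl4ai import CrawlerRunConfig

# Restore the previous behavior for pages where fully loaded images
# matter more than crawl speed.
config = CrawlerRunConfig(wait_for_images=True)
```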

View File

@@ -418,34 +418,30 @@ class AsyncWebCrawler:
                     **kwargs
                 )
-                # crawl_result.status_code = async_response.status_code
-                # crawl_result.response_headers = async_response.response_headers
-                # crawl_result.downloaded_files = async_response.downloaded_files
-                # crawl_result.ssl_certificate = async_response.ssl_certificate  # Add SSL certificate
-                # else:
-                #     crawl_result.status_code = 200
-                #     crawl_result.response_headers = cached_result.response_headers if cached_result else {}
-                #     crawl_result.ssl_certificate = cached_result.ssl_certificate if cached_result else None  # Add SSL certificate from cache
+                crawl_result.status_code = async_response.status_code
+                crawl_result.response_headers = async_response.response_headers
+                crawl_result.downloaded_files = async_response.downloaded_files
+                crawl_result.ssl_certificate = async_response.ssl_certificate  # Add SSL certificate
                 # # Check and set values from async_response to crawl_result
-                try:
-                    for key in vars(async_response):
-                        if hasattr(crawl_result, key):
-                            value = getattr(async_response, key, None)
-                            current_value = getattr(crawl_result, key, None)
-                            if value is not None and not current_value:
-                                try:
-                                    setattr(crawl_result, key, value)
-                                except Exception as e:
-                                    self.logger.warning(
-                                        message=f"Failed to set attribute {key}: {str(e)}",
-                                        tag="WARNING"
-                                    )
-                except Exception as e:
-                    self.logger.warning(
-                        message=f"Error copying response attributes: {str(e)}",
-                        tag="WARNING"
-                    )
+                # try:
+                #     for key in vars(async_response):
+                #         if hasattr(crawl_result, key):
+                #             value = getattr(async_response, key, None)
+                #             current_value = getattr(crawl_result, key, None)
+                #             if value is not None and not current_value:
+                #                 try:
+                #                     setattr(crawl_result, key, value)
+                #                 except Exception as e:
+                #                     self.logger.warning(
+                #                         message=f"Failed to set attribute {key}: {str(e)}",
+                #                         tag="WARNING"
+                #                     )
+                # except Exception as e:
+                #     self.logger.warning(
+                #         message=f"Error copying response attributes: {str(e)}",
+                #         tag="WARNING"
+                #     )
                 crawl_result.success = bool(html)
                 crawl_result.session_id = getattr(config, 'session_id', None)
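
The practical upshot of this hunk: the four response fields are now assigned unconditionally from `async_response` rather than through the commented-out reflective copy loop. A usage sketch (the URL is illustrative; the field names match the assignments above):

```python
import asyncio
from crawl4ai import AsyncWebCrawler

async def main():
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://example.com")
        # Set directly from the async response in this version:
        print(result.status_code)       # e.g. 200
        print(result.response_headers)  # response header mapping
        print(result.downloaded_files)  # None unless downloads occurred
        print(result.ssl_certificate)   # None unless certificate capture is enabled

asyncio.run(main())
```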
@@ -585,8 +581,10 @@ class AsyncWebCrawler:
             # Markdown Generation
             markdown_generator: Optional[MarkdownGenerationStrategy] = config.markdown_generator or DefaultMarkdownGenerator()
-            if not config.content_filter and not markdown_generator.content_filter:
-                markdown_generator.content_filter = PruningContentFilter()
+            # Uncomment if by default we want to use PruningContentFilter
+            # if not config.content_filter and not markdown_generator.content_filter:
+            #     markdown_generator.content_filter = PruningContentFilter()
             markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown(
                 cleaned_html=cleaned_html,
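
Because the automatic `PruningContentFilter` fallback is now commented out, `fit_markdown` stays empty unless a filter is attached explicitly. A minimal sketch of opting back in, mirroring the updated hello_world example at the end of this diff:

```python
from crawl4ai import CrawlerRunConfig, DefaultMarkdownGenerator, PruningContentFilter

config = CrawlerRunConfig(
    markdown_generator=DefaultMarkdownGenerator(
        content_filter=PruningContentFilter()  # no longer applied implicitly
    )
)
```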

View File

@@ -143,41 +143,83 @@ class DefaultMarkdownGenerator(MarkdownGenerationStrategy):
         Returns:
             MarkdownGenerationResult: Result containing raw markdown, fit markdown, fit HTML, and references markdown.
         """
-        # Initialize HTML2Text with options
-        h = CustomHTML2Text()
-        if html2text_options:
-            h.update_params(**html2text_options)
-        elif options:
-            h.update_params(**options)
-        elif self.options:
-            h.update_params(**self.options)
-
-        # Generate raw markdown
-        raw_markdown = h.handle(cleaned_html)
-        raw_markdown = raw_markdown.replace('    ```', '```')
-
-        # Convert links to citations
-        markdown_with_citations: str = ""
-        references_markdown: str = ""
-        if citations:
-            markdown_with_citations, references_markdown = self.convert_links_to_citations(
-                raw_markdown, base_url
-            )
-
-        # Generate fit markdown if content filter is provided
-        fit_markdown: Optional[str] = ""
-        filtered_html: Optional[str] = ""
-        if content_filter or self.content_filter:
-            content_filter = content_filter or self.content_filter
-            filtered_html = content_filter.filter_content(cleaned_html)
-            filtered_html = '\n'.join('<div>{}</div>'.format(s) for s in filtered_html)
-            fit_markdown = h.handle(filtered_html)
-
-        return MarkdownGenerationResult(
-            raw_markdown=raw_markdown,
-            markdown_with_citations=markdown_with_citations,
-            references_markdown=references_markdown,
-            fit_markdown=fit_markdown,
-            fit_html=filtered_html,
-        )
+        try:
+            # Initialize HTML2Text with default options for better conversion
+            h = CustomHTML2Text(baseurl=base_url)
+            default_options = {
+                'body_width': 0,  # Disable text wrapping
+                'ignore_emphasis': False,
+                'ignore_links': False,
+                'ignore_images': False,
+                'protect_links': True,
+                'single_line_break': True,
+                'mark_code': True,
+                'escape_snob': False
+            }
+
+            # Update with custom options if provided
+            if html2text_options:
+                default_options.update(html2text_options)
+            elif options:
+                default_options.update(options)
+            elif self.options:
+                default_options.update(self.options)
+            h.update_params(**default_options)
+
+            # Ensure we have valid input
+            if not cleaned_html:
+                cleaned_html = ""
+            elif not isinstance(cleaned_html, str):
+                cleaned_html = str(cleaned_html)
+
+            # Generate raw markdown
+            try:
+                raw_markdown = h.handle(cleaned_html)
+            except Exception as e:
+                raw_markdown = f"Error converting HTML to markdown: {str(e)}"
+            raw_markdown = raw_markdown.replace('    ```', '```')
+
+            # Convert links to citations
+            markdown_with_citations: str = raw_markdown
+            references_markdown: str = ""
+            if citations:
+                try:
+                    markdown_with_citations, references_markdown = self.convert_links_to_citations(
+                        raw_markdown, base_url
+                    )
+                except Exception as e:
+                    markdown_with_citations = raw_markdown
+                    references_markdown = f"Error generating citations: {str(e)}"
+
+            # Generate fit markdown if content filter is provided
+            fit_markdown: Optional[str] = ""
+            filtered_html: Optional[str] = ""
+            if content_filter or self.content_filter:
+                try:
+                    content_filter = content_filter or self.content_filter
+                    filtered_html = content_filter.filter_content(cleaned_html)
+                    filtered_html = '\n'.join('<div>{}</div>'.format(s) for s in filtered_html)
+                    fit_markdown = h.handle(filtered_html)
+                except Exception as e:
+                    fit_markdown = f"Error generating fit markdown: {str(e)}"
+                    filtered_html = ""
+
+            return MarkdownGenerationResult(
+                raw_markdown=raw_markdown or "",
+                markdown_with_citations=markdown_with_citations or "",
+                references_markdown=references_markdown or "",
+                fit_markdown=fit_markdown or "",
+                fit_html=filtered_html or "",
+            )
+        except Exception as e:
+            # If anything fails, return empty strings with error message
+            error_msg = f"Error in markdown generation: {str(e)}"
+            return MarkdownGenerationResult(
+                raw_markdown=error_msg,
+                markdown_with_citations=error_msg,
+                references_markdown="",
+                fit_markdown="",
+                fit_html="",
+            )
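
Beyond the added error handling, the key behavioral change here is that `CustomHTML2Text` now receives a `baseurl` plus a set of defaults (`body_width: 0`, `protect_links`, `mark_code`, and so on) that user options merge into rather than replace. A hedged sketch of calling the generator directly, with parameter names taken from the hunk above (the HTML snippet and option override are illustrative):

```python
from crawl4ai import DefaultMarkdownGenerator

gen = DefaultMarkdownGenerator()
result = gen.generate_markdown(
    cleaned_html="<h1>Title</h1><p>See the <a href='https://example.com/docs'>docs</a>.</p>",
    base_url="https://example.com",
    html2text_options={"ignore_links": True},  # merged over the new defaults
)
print(result.raw_markdown)
print(result.fit_markdown)  # empty here: no content_filter was attached
```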

View File

@@ -0,0 +1,25 @@
+import os, sys
+sys.path.append(
+    os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+)
+
+import asyncio
+from crawl4ai import *
+
+async def main():
+    async with AsyncWebCrawler() as crawler:
+        crawler_config = CrawlerRunConfig(
+            cache_mode=CacheMode.BYPASS,
+            markdown_generator=DefaultMarkdownGenerator(
+                content_filter=PruningContentFilter(threshold=0.48, threshold_type="fixed", min_word_threshold=0)
+            )
+        )
+        result = await crawler.arun(
+            url="https://www.nbcnews.com/business",
+            config=crawler_config
+        )
+        print(result.markdown_v2.raw_markdown[:500])
+
+if __name__ == "__main__":
+    asyncio.run(main())
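
Since the script wires in `PruningContentFilter`, the returned `result.markdown_v2` should be the same `MarkdownGenerationResult` built above, so swapping the final print to `result.markdown_v2.fit_markdown[:500]` is a quick way to compare filtered output against the raw markdown; the NBC News URL is simply the example target this commit chose.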