Merge branch 'vr0.4.246'
This commit is contained in:
@@ -1,2 +1,2 @@
|
|||||||
# crawl4ai/_version.py
|
# crawl4ai/_version.py
|
||||||
__version__ = "0.4.245"
|
__version__ = "0.4.246"
|
||||||
|
|||||||
@@ -246,7 +246,7 @@ class CrawlerRunConfig:
|
|||||||
wait_for (str or None): A CSS selector or JS condition to wait for before extracting content.
|
wait_for (str or None): A CSS selector or JS condition to wait for before extracting content.
|
||||||
Default: None.
|
Default: None.
|
||||||
wait_for_images (bool): If True, wait for images to load before extracting content.
|
wait_for_images (bool): If True, wait for images to load before extracting content.
|
||||||
Default: True.
|
Default: False.
|
||||||
delay_before_return_html (float): Delay in seconds before retrieving final HTML.
|
delay_before_return_html (float): Delay in seconds before retrieving final HTML.
|
||||||
Default: 0.1.
|
Default: 0.1.
|
||||||
mean_delay (float): Mean base delay between requests when calling arun_many.
|
mean_delay (float): Mean base delay between requests when calling arun_many.
|
||||||
@@ -345,7 +345,7 @@ class CrawlerRunConfig:
|
|||||||
wait_until: str = "domcontentloaded",
|
wait_until: str = "domcontentloaded",
|
||||||
page_timeout: int = PAGE_TIMEOUT,
|
page_timeout: int = PAGE_TIMEOUT,
|
||||||
wait_for: str = None,
|
wait_for: str = None,
|
||||||
wait_for_images: bool = True,
|
wait_for_images: bool = False,
|
||||||
delay_before_return_html: float = 0.1,
|
delay_before_return_html: float = 0.1,
|
||||||
mean_delay: float = 0.1,
|
mean_delay: float = 0.1,
|
||||||
max_range: float = 0.3,
|
max_range: float = 0.3,
|
||||||
@@ -503,7 +503,7 @@ class CrawlerRunConfig:
|
|||||||
wait_until=kwargs.get("wait_until", "domcontentloaded"),
|
wait_until=kwargs.get("wait_until", "domcontentloaded"),
|
||||||
page_timeout=kwargs.get("page_timeout", 60000),
|
page_timeout=kwargs.get("page_timeout", 60000),
|
||||||
wait_for=kwargs.get("wait_for"),
|
wait_for=kwargs.get("wait_for"),
|
||||||
wait_for_images=kwargs.get("wait_for_images", True),
|
wait_for_images=kwargs.get("wait_for_images", False),
|
||||||
delay_before_return_html=kwargs.get("delay_before_return_html", 0.1),
|
delay_before_return_html=kwargs.get("delay_before_return_html", 0.1),
|
||||||
mean_delay=kwargs.get("mean_delay", 0.1),
|
mean_delay=kwargs.get("mean_delay", 0.1),
|
||||||
max_range=kwargs.get("max_range", 0.3),
|
max_range=kwargs.get("max_range", 0.3),
|
||||||
|
|||||||
@@ -418,34 +418,30 @@ class AsyncWebCrawler:
|
|||||||
**kwargs
|
**kwargs
|
||||||
)
|
)
|
||||||
|
|
||||||
# crawl_result.status_code = async_response.status_code
|
crawl_result.status_code = async_response.status_code
|
||||||
# crawl_result.response_headers = async_response.response_headers
|
crawl_result.response_headers = async_response.response_headers
|
||||||
# crawl_result.downloaded_files = async_response.downloaded_files
|
crawl_result.downloaded_files = async_response.downloaded_files
|
||||||
# crawl_result.ssl_certificate = async_response.ssl_certificate # Add SSL certificate
|
crawl_result.ssl_certificate = async_response.ssl_certificate # Add SSL certificate
|
||||||
# else:
|
|
||||||
# crawl_result.status_code = 200
|
|
||||||
# crawl_result.response_headers = cached_result.response_headers if cached_result else {}
|
|
||||||
# crawl_result.ssl_certificate = cached_result.ssl_certificate if cached_result else None # Add SSL certificate from cache
|
|
||||||
|
|
||||||
# # Check and set values from async_response to crawl_result
|
# # Check and set values from async_response to crawl_result
|
||||||
try:
|
# try:
|
||||||
for key in vars(async_response):
|
# for key in vars(async_response):
|
||||||
if hasattr(crawl_result, key):
|
# if hasattr(crawl_result, key):
|
||||||
value = getattr(async_response, key, None)
|
# value = getattr(async_response, key, None)
|
||||||
current_value = getattr(crawl_result, key, None)
|
# current_value = getattr(crawl_result, key, None)
|
||||||
if value is not None and not current_value:
|
# if value is not None and not current_value:
|
||||||
try:
|
# try:
|
||||||
setattr(crawl_result, key, value)
|
# setattr(crawl_result, key, value)
|
||||||
except Exception as e:
|
# except Exception as e:
|
||||||
self.logger.warning(
|
# self.logger.warning(
|
||||||
message=f"Failed to set attribute {key}: {str(e)}",
|
# message=f"Failed to set attribute {key}: {str(e)}",
|
||||||
tag="WARNING"
|
# tag="WARNING"
|
||||||
)
|
# )
|
||||||
except Exception as e:
|
# except Exception as e:
|
||||||
self.logger.warning(
|
# self.logger.warning(
|
||||||
message=f"Error copying response attributes: {str(e)}",
|
# message=f"Error copying response attributes: {str(e)}",
|
||||||
tag="WARNING"
|
# tag="WARNING"
|
||||||
)
|
# )
|
||||||
|
|
||||||
crawl_result.success = bool(html)
|
crawl_result.success = bool(html)
|
||||||
crawl_result.session_id = getattr(config, 'session_id', None)
|
crawl_result.session_id = getattr(config, 'session_id', None)
|
||||||
@@ -585,8 +581,10 @@ class AsyncWebCrawler:
|
|||||||
|
|
||||||
# Markdown Generation
|
# Markdown Generation
|
||||||
markdown_generator: Optional[MarkdownGenerationStrategy] = config.markdown_generator or DefaultMarkdownGenerator()
|
markdown_generator: Optional[MarkdownGenerationStrategy] = config.markdown_generator or DefaultMarkdownGenerator()
|
||||||
if not config.content_filter and not markdown_generator.content_filter:
|
|
||||||
markdown_generator.content_filter = PruningContentFilter()
|
# Uncomment if by default we want to use PruningContentFilter
|
||||||
|
# if not config.content_filter and not markdown_generator.content_filter:
|
||||||
|
# markdown_generator.content_filter = PruningContentFilter()
|
||||||
|
|
||||||
markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown(
|
markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown(
|
||||||
cleaned_html=cleaned_html,
|
cleaned_html=cleaned_html,
|
||||||
|
|||||||
@@ -143,41 +143,83 @@ class DefaultMarkdownGenerator(MarkdownGenerationStrategy):
|
|||||||
Returns:
|
Returns:
|
||||||
MarkdownGenerationResult: Result containing raw markdown, fit markdown, fit HTML, and references markdown.
|
MarkdownGenerationResult: Result containing raw markdown, fit markdown, fit HTML, and references markdown.
|
||||||
"""
|
"""
|
||||||
# Initialize HTML2Text with options
|
try:
|
||||||
h = CustomHTML2Text()
|
# Initialize HTML2Text with default options for better conversion
|
||||||
if html2text_options:
|
h = CustomHTML2Text(baseurl=base_url)
|
||||||
h.update_params(**html2text_options)
|
default_options = {
|
||||||
elif options:
|
'body_width': 0, # Disable text wrapping
|
||||||
h.update_params(**options)
|
'ignore_emphasis': False,
|
||||||
elif self.options:
|
'ignore_links': False,
|
||||||
h.update_params(**self.options)
|
'ignore_images': False,
|
||||||
|
'protect_links': True,
|
||||||
|
'single_line_break': True,
|
||||||
|
'mark_code': True,
|
||||||
|
'escape_snob': False
|
||||||
|
}
|
||||||
|
|
||||||
|
# Update with custom options if provided
|
||||||
|
if html2text_options:
|
||||||
|
default_options.update(html2text_options)
|
||||||
|
elif options:
|
||||||
|
default_options.update(options)
|
||||||
|
elif self.options:
|
||||||
|
default_options.update(self.options)
|
||||||
|
|
||||||
|
h.update_params(**default_options)
|
||||||
|
|
||||||
# Generate raw markdown
|
# Ensure we have valid input
|
||||||
raw_markdown = h.handle(cleaned_html)
|
if not cleaned_html:
|
||||||
raw_markdown = raw_markdown.replace(' ```', '```')
|
cleaned_html = ""
|
||||||
|
elif not isinstance(cleaned_html, str):
|
||||||
|
cleaned_html = str(cleaned_html)
|
||||||
|
|
||||||
# Convert links to citations
|
# Generate raw markdown
|
||||||
markdown_with_citations: str = ""
|
try:
|
||||||
references_markdown: str = ""
|
raw_markdown = h.handle(cleaned_html)
|
||||||
if citations:
|
except Exception as e:
|
||||||
markdown_with_citations, references_markdown = self.convert_links_to_citations(
|
raw_markdown = f"Error converting HTML to markdown: {str(e)}"
|
||||||
raw_markdown, base_url
|
|
||||||
|
raw_markdown = raw_markdown.replace(' ```', '```')
|
||||||
|
|
||||||
|
# Convert links to citations
|
||||||
|
markdown_with_citations: str = raw_markdown
|
||||||
|
references_markdown: str = ""
|
||||||
|
if citations:
|
||||||
|
try:
|
||||||
|
markdown_with_citations, references_markdown = self.convert_links_to_citations(
|
||||||
|
raw_markdown, base_url
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
markdown_with_citations = raw_markdown
|
||||||
|
references_markdown = f"Error generating citations: {str(e)}"
|
||||||
|
|
||||||
|
# Generate fit markdown if content filter is provided
|
||||||
|
fit_markdown: Optional[str] = ""
|
||||||
|
filtered_html: Optional[str] = ""
|
||||||
|
if content_filter or self.content_filter:
|
||||||
|
try:
|
||||||
|
content_filter = content_filter or self.content_filter
|
||||||
|
filtered_html = content_filter.filter_content(cleaned_html)
|
||||||
|
filtered_html = '\n'.join('<div>{}</div>'.format(s) for s in filtered_html)
|
||||||
|
fit_markdown = h.handle(filtered_html)
|
||||||
|
except Exception as e:
|
||||||
|
fit_markdown = f"Error generating fit markdown: {str(e)}"
|
||||||
|
filtered_html = ""
|
||||||
|
|
||||||
|
return MarkdownGenerationResult(
|
||||||
|
raw_markdown=raw_markdown or "",
|
||||||
|
markdown_with_citations=markdown_with_citations or "",
|
||||||
|
references_markdown=references_markdown or "",
|
||||||
|
fit_markdown=fit_markdown or "",
|
||||||
|
fit_html=filtered_html or "",
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
# If anything fails, return empty strings with error message
|
||||||
|
error_msg = f"Error in markdown generation: {str(e)}"
|
||||||
|
return MarkdownGenerationResult(
|
||||||
|
raw_markdown=error_msg,
|
||||||
|
markdown_with_citations=error_msg,
|
||||||
|
references_markdown="",
|
||||||
|
fit_markdown="",
|
||||||
|
fit_html="",
|
||||||
)
|
)
|
||||||
|
|
||||||
# Generate fit markdown if content filter is provided
|
|
||||||
fit_markdown: Optional[str] = ""
|
|
||||||
filtered_html: Optional[str] = ""
|
|
||||||
if content_filter or self.content_filter:
|
|
||||||
content_filter = content_filter or self.content_filter
|
|
||||||
filtered_html = content_filter.filter_content(cleaned_html)
|
|
||||||
filtered_html = '\n'.join('<div>{}</div>'.format(s) for s in filtered_html)
|
|
||||||
fit_markdown = h.handle(filtered_html)
|
|
||||||
|
|
||||||
return MarkdownGenerationResult(
|
|
||||||
raw_markdown=raw_markdown,
|
|
||||||
markdown_with_citations=markdown_with_citations,
|
|
||||||
references_markdown=references_markdown,
|
|
||||||
fit_markdown=fit_markdown,
|
|
||||||
fit_html=filtered_html,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|||||||
25
docs/examples/hello_world.py
Normal file
25
docs/examples/hello_world.py
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
import os, sys
|
||||||
|
|
||||||
|
sys.path.append(
|
||||||
|
os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||||
|
)
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
from crawl4ai import *
|
||||||
|
|
||||||
|
async def main():
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
crawler_config = CrawlerRunConfig(
|
||||||
|
cache_mode=CacheMode.BYPASS,
|
||||||
|
markdown_generator=DefaultMarkdownGenerator(
|
||||||
|
content_filter=PruningContentFilter(threshold=0.48, threshold_type="fixed", min_word_threshold=0)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
result = await crawler.arun(
|
||||||
|
url="https://www.nbcnews.com/business",
|
||||||
|
config=crawler_config
|
||||||
|
)
|
||||||
|
print(result.markdown_v2.raw_markdown[:500])
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(main())
|
||||||
Reference in New Issue
Block a user