Updated to version 0.4.0 with new features

- Enhanced error handling in async crawler.
- Added flexible options in Markdown generation.
- Updated user agent settings for improved reliability.
- Reflected changes in documentation and examples.
This commit is contained in:
UncleCode
2024-12-04 20:26:39 +08:00
parent b02544bc0b
commit 486db3a771
5 changed files with 69 additions and 16 deletions

View File

@@ -1,2 +1,2 @@
# crawl4ai/_version.py # crawl4ai/_version.py
__version__ = "0.3.747" __version__ = "0.4.0"

View File

@@ -6,6 +6,7 @@ from typing import Callable, Dict, Any, List, Optional, Awaitable
import os, sys, shutil import os, sys, shutil
import tempfile, subprocess import tempfile, subprocess
from playwright.async_api import async_playwright, Page, Browser, Error from playwright.async_api import async_playwright, Page, Browser, Error
from playwright.async_api import TimeoutError as PlaywrightTimeoutError
from io import BytesIO from io import BytesIO
from PIL import Image, ImageDraw, ImageFont from PIL import Image, ImageDraw, ImageFont
from pathlib import Path from pathlib import Path
@@ -223,6 +224,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
self.use_cached_html = use_cached_html self.use_cached_html = use_cached_html
self.user_agent = kwargs.get( self.user_agent = kwargs.get(
"user_agent", "user_agent",
# "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.5845.187 Safari/604.1 Edg/117.0.2045.47"
"Mozilla/5.0 (Linux; Android 11; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.120 Mobile Safari/537.36" "Mozilla/5.0 (Linux; Android 11; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.120 Mobile Safari/537.36"
) )
user_agenr_generator = UserAgentGenerator() user_agenr_generator = UserAgentGenerator()
@@ -941,11 +943,24 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
}); });
} }
""" """
try: try:
await page.wait_for_load_state() try:
await page.wait_for_load_state(
# state="load",
state="domcontentloaded",
timeout=5
)
except PlaywrightTimeoutError:
pass
await page.evaluate(update_image_dimensions_js) await page.evaluate(update_image_dimensions_js)
except Exception as e: except Exception as e:
raise RuntimeError(f"Error updating image dimensions ACS-UPDATE_IMAGE_DIMENSIONS_JS: {str(e)}") self.logger.error(
message="Error updating image dimensions ACS-UPDATE_IMAGE_DIMENSIONS_JS: {error}",
tag="ERROR",
params={"error": str(e)}
)
# raise RuntimeError(f"Error updating image dimensions ACS-UPDATE_IMAGE_DIMENSIONS_JS: {str(e)}")
# Wait a bit for any onload events to complete # Wait a bit for any onload events to complete
await page.wait_for_timeout(100) await page.wait_for_timeout(100)

View File

@@ -11,8 +11,9 @@ LINK_PATTERN = re.compile(r'!?\[([^\]]+)\]\(([^)]+?)(?:\s+"([^"]*)")?\)')
class MarkdownGenerationStrategy(ABC): class MarkdownGenerationStrategy(ABC):
"""Abstract base class for markdown generation strategies.""" """Abstract base class for markdown generation strategies."""
def __init__(self, content_filter: Optional[RelevantContentFilter] = None): def __init__(self, content_filter: Optional[RelevantContentFilter] = None, options: Optional[Dict[str, Any]] = None):
self.content_filter = content_filter self.content_filter = content_filter
self.options = options or {}
@abstractmethod @abstractmethod
def generate_markdown(self, def generate_markdown(self,
@@ -27,8 +28,8 @@ class MarkdownGenerationStrategy(ABC):
class DefaultMarkdownGenerator(MarkdownGenerationStrategy): class DefaultMarkdownGenerator(MarkdownGenerationStrategy):
"""Default implementation of markdown generation strategy.""" """Default implementation of markdown generation strategy."""
def __init__(self, content_filter: Optional[RelevantContentFilter] = None): def __init__(self, content_filter: Optional[RelevantContentFilter] = None, options: Optional[Dict[str, Any]] = None):
super().__init__(content_filter) super().__init__(content_filter, options)
def convert_links_to_citations(self, markdown: str, base_url: str = "") -> Tuple[str, str]: def convert_links_to_citations(self, markdown: str, base_url: str = "") -> Tuple[str, str]:
link_map = {} link_map = {}
@@ -74,6 +75,7 @@ class DefaultMarkdownGenerator(MarkdownGenerationStrategy):
cleaned_html: str, cleaned_html: str,
base_url: str = "", base_url: str = "",
html2text_options: Optional[Dict[str, Any]] = None, html2text_options: Optional[Dict[str, Any]] = None,
options: Optional[Dict[str, Any]] = None,
content_filter: Optional[RelevantContentFilter] = None, content_filter: Optional[RelevantContentFilter] = None,
citations: bool = True, citations: bool = True,
**kwargs) -> MarkdownGenerationResult: **kwargs) -> MarkdownGenerationResult:
@@ -82,6 +84,10 @@ class DefaultMarkdownGenerator(MarkdownGenerationStrategy):
h = CustomHTML2Text() h = CustomHTML2Text()
if html2text_options: if html2text_options:
h.update_params(**html2text_options) h.update_params(**html2text_options)
elif options:
h.update_params(**options)
elif self.options:
h.update_params(**self.options)
# Generate raw markdown # Generate raw markdown
raw_markdown = h.handle(cleaned_html) raw_markdown = h.handle(cleaned_html)

View File

@@ -236,6 +236,7 @@ class UserAgentGenerator:
# Example usage: # Example usage:
if __name__ == "__main__": if __name__ == "__main__":
generator = UserAgentGenerator() generator = UserAgentGenerator()
print(generator.generate())
print("\nSingle browser (Chrome):") print("\nSingle browser (Chrome):")
print(generator.generate(num_browsers=1, browser_type='chrome')) print(generator.generate(num_browsers=1, browser_type='chrome'))

View File

@@ -547,18 +547,49 @@ async def generate_knowledge_graph():
f.write(result.extracted_content) f.write(result.extracted_content)
async def fit_markdown_remove_overlay(): async def fit_markdown_remove_overlay():
async with AsyncWebCrawler(headless = False) as crawler: async with AsyncWebCrawler(
url = "https://janineintheworld.com/places-to-visit-in-central-mexico" headless=True, # Set to False to see what is happening
verbose=True,
user_agent_mode="random",
user_agent_generator_config={
"device_type": "mobile",
"os_type": "android"
},
) as crawler:
result = await crawler.arun( result = await crawler.arun(
url=url, url='https://www.kidocode.com/degrees/technology',
cache_mode=CacheMode.BYPASS, cache_mode=CacheMode.BYPASS,
word_count_threshold = 10, markdown_generator=DefaultMarkdownGenerator(
remove_overlay_elements=True, content_filter=PruningContentFilter(threshold=0.48, threshold_type="fixed", min_word_threshold=0),
screenshot = True options={
"ignore_links": True
}
),
# markdown_generator=DefaultMarkdownGenerator(
# content_filter=BM25ContentFilter(user_query=None, bm25_threshold=1.0),
# options={
# "ignore_links": True
# }
# ),
) )
# Save markdown to file
with open(os.path.join(__location__, "mexico_places.md"), "w") as f: if result.success:
f.write(result.fit_markdown) print(len(result.markdown_v2.raw_markdown))
print(len(result.markdown_v2.markdown_with_citations))
print(len(result.markdown_v2.fit_markdown))
# Save clean html
with open(os.path.join(__location__, "output/cleaned_html.html"), "w") as f:
f.write(result.cleaned_html)
with open(os.path.join(__location__, "output/output_raw_markdown.md"), "w") as f:
f.write(result.markdown_v2.raw_markdown)
with open(os.path.join(__location__, "output/output_markdown_with_citations.md"), "w") as f:
f.write(result.markdown_v2.markdown_with_citations)
with open(os.path.join(__location__, "output/output_fit_markdown.md"), "w") as f:
f.write(result.markdown_v2.fit_markdown)
print("Done") print("Done")