Updated to version 0.4.0 with new features
- Enhanced error handling in async crawler.
- Added flexible options in Markdown generation.
- Updated user agent settings for improved reliability.
- Reflected changes in documentation and examples.
This commit is contained in:
@@ -1,2 +1,2 @@
|
|||||||
# crawl4ai/_version.py
|
# crawl4ai/_version.py
|
||||||
__version__ = "0.3.747"
|
__version__ = "0.4.0"
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ from typing import Callable, Dict, Any, List, Optional, Awaitable
|
|||||||
import os, sys, shutil
|
import os, sys, shutil
|
||||||
import tempfile, subprocess
|
import tempfile, subprocess
|
||||||
from playwright.async_api import async_playwright, Page, Browser, Error
|
from playwright.async_api import async_playwright, Page, Browser, Error
|
||||||
|
from playwright.async_api import TimeoutError as PlaywrightTimeoutError
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from PIL import Image, ImageDraw, ImageFont
|
from PIL import Image, ImageDraw, ImageFont
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@@ -223,6 +224,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
self.use_cached_html = use_cached_html
|
self.use_cached_html = use_cached_html
|
||||||
self.user_agent = kwargs.get(
|
self.user_agent = kwargs.get(
|
||||||
"user_agent",
|
"user_agent",
|
||||||
|
# "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.5845.187 Safari/604.1 Edg/117.0.2045.47"
|
||||||
"Mozilla/5.0 (Linux; Android 11; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.120 Mobile Safari/537.36"
|
"Mozilla/5.0 (Linux; Android 11; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.120 Mobile Safari/537.36"
|
||||||
)
|
)
|
||||||
user_agenr_generator = UserAgentGenerator()
|
user_agenr_generator = UserAgentGenerator()
|
||||||
@@ -941,11 +943,24 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
"""
|
"""
|
||||||
|
|
||||||
try:
|
try:
|
||||||
await page.wait_for_load_state()
|
try:
|
||||||
|
await page.wait_for_load_state(
|
||||||
|
# state="load",
|
||||||
|
state="domcontentloaded",
|
||||||
|
timeout=5
|
||||||
|
)
|
||||||
|
except PlaywrightTimeoutError:
|
||||||
|
pass
|
||||||
await page.evaluate(update_image_dimensions_js)
|
await page.evaluate(update_image_dimensions_js)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise RuntimeError(f"Error updating image dimensions ACS-UPDATE_IMAGE_DIMENSIONS_JS: {str(e)}")
|
self.logger.error(
|
||||||
|
message="Error updating image dimensions ACS-UPDATE_IMAGE_DIMENSIONS_JS: {error}",
|
||||||
|
tag="ERROR",
|
||||||
|
params={"error": str(e)}
|
||||||
|
)
|
||||||
|
# raise RuntimeError(f"Error updating image dimensions ACS-UPDATE_IMAGE_DIMENSIONS_JS: {str(e)}")
|
||||||
|
|
||||||
# Wait a bit for any onload events to complete
|
# Wait a bit for any onload events to complete
|
||||||
await page.wait_for_timeout(100)
|
await page.wait_for_timeout(100)
|
||||||
|
|||||||
@@ -11,8 +11,9 @@ LINK_PATTERN = re.compile(r'!?\[([^\]]+)\]\(([^)]+?)(?:\s+"([^"]*)")?\)')
|
|||||||
|
|
||||||
class MarkdownGenerationStrategy(ABC):
|
class MarkdownGenerationStrategy(ABC):
|
||||||
"""Abstract base class for markdown generation strategies."""
|
"""Abstract base class for markdown generation strategies."""
|
||||||
def __init__(self, content_filter: Optional[RelevantContentFilter] = None):
|
def __init__(self, content_filter: Optional[RelevantContentFilter] = None, options: Optional[Dict[str, Any]] = None):
|
||||||
self.content_filter = content_filter
|
self.content_filter = content_filter
|
||||||
|
self.options = options or {}
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def generate_markdown(self,
|
def generate_markdown(self,
|
||||||
@@ -27,8 +28,8 @@ class MarkdownGenerationStrategy(ABC):
|
|||||||
|
|
||||||
class DefaultMarkdownGenerator(MarkdownGenerationStrategy):
|
class DefaultMarkdownGenerator(MarkdownGenerationStrategy):
|
||||||
"""Default implementation of markdown generation strategy."""
|
"""Default implementation of markdown generation strategy."""
|
||||||
def __init__(self, content_filter: Optional[RelevantContentFilter] = None):
|
def __init__(self, content_filter: Optional[RelevantContentFilter] = None, options: Optional[Dict[str, Any]] = None):
|
||||||
super().__init__(content_filter)
|
super().__init__(content_filter, options)
|
||||||
|
|
||||||
def convert_links_to_citations(self, markdown: str, base_url: str = "") -> Tuple[str, str]:
|
def convert_links_to_citations(self, markdown: str, base_url: str = "") -> Tuple[str, str]:
|
||||||
link_map = {}
|
link_map = {}
|
||||||
@@ -74,6 +75,7 @@ class DefaultMarkdownGenerator(MarkdownGenerationStrategy):
|
|||||||
cleaned_html: str,
|
cleaned_html: str,
|
||||||
base_url: str = "",
|
base_url: str = "",
|
||||||
html2text_options: Optional[Dict[str, Any]] = None,
|
html2text_options: Optional[Dict[str, Any]] = None,
|
||||||
|
options: Optional[Dict[str, Any]] = None,
|
||||||
content_filter: Optional[RelevantContentFilter] = None,
|
content_filter: Optional[RelevantContentFilter] = None,
|
||||||
citations: bool = True,
|
citations: bool = True,
|
||||||
**kwargs) -> MarkdownGenerationResult:
|
**kwargs) -> MarkdownGenerationResult:
|
||||||
@@ -82,6 +84,10 @@ class DefaultMarkdownGenerator(MarkdownGenerationStrategy):
|
|||||||
h = CustomHTML2Text()
|
h = CustomHTML2Text()
|
||||||
if html2text_options:
|
if html2text_options:
|
||||||
h.update_params(**html2text_options)
|
h.update_params(**html2text_options)
|
||||||
|
elif options:
|
||||||
|
h.update_params(**options)
|
||||||
|
elif self.options:
|
||||||
|
h.update_params(**self.options)
|
||||||
|
|
||||||
# Generate raw markdown
|
# Generate raw markdown
|
||||||
raw_markdown = h.handle(cleaned_html)
|
raw_markdown = h.handle(cleaned_html)
|
||||||
|
|||||||
@@ -236,6 +236,7 @@ class UserAgentGenerator:
|
|||||||
# Example usage:
|
# Example usage:
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
generator = UserAgentGenerator()
|
generator = UserAgentGenerator()
|
||||||
|
print(generator.generate())
|
||||||
|
|
||||||
print("\nSingle browser (Chrome):")
|
print("\nSingle browser (Chrome):")
|
||||||
print(generator.generate(num_browsers=1, browser_type='chrome'))
|
print(generator.generate(num_browsers=1, browser_type='chrome'))
|
||||||
|
|||||||
@@ -547,19 +547,50 @@ async def generate_knowledge_graph():
|
|||||||
f.write(result.extracted_content)
|
f.write(result.extracted_content)
|
||||||
|
|
||||||
async def fit_markdown_remove_overlay():
|
async def fit_markdown_remove_overlay():
|
||||||
async with AsyncWebCrawler(headless = False) as crawler:
|
async with AsyncWebCrawler(
|
||||||
url = "https://janineintheworld.com/places-to-visit-in-central-mexico"
|
headless=True, # Set to False to see what is happening
|
||||||
|
verbose=True,
|
||||||
|
user_agent_mode="random",
|
||||||
|
user_agent_generator_config={
|
||||||
|
"device_type": "mobile",
|
||||||
|
"os_type": "android"
|
||||||
|
},
|
||||||
|
) as crawler:
|
||||||
result = await crawler.arun(
|
result = await crawler.arun(
|
||||||
url=url,
|
url='https://www.kidocode.com/degrees/technology',
|
||||||
cache_mode=CacheMode.BYPASS,
|
cache_mode=CacheMode.BYPASS,
|
||||||
word_count_threshold = 10,
|
markdown_generator=DefaultMarkdownGenerator(
|
||||||
remove_overlay_elements=True,
|
content_filter=PruningContentFilter(threshold=0.48, threshold_type="fixed", min_word_threshold=0),
|
||||||
screenshot = True
|
options={
|
||||||
|
"ignore_links": True
|
||||||
|
}
|
||||||
|
),
|
||||||
|
# markdown_generator=DefaultMarkdownGenerator(
|
||||||
|
# content_filter=BM25ContentFilter(user_query=None, bm25_threshold=1.0),
|
||||||
|
# options={
|
||||||
|
# "ignore_links": True
|
||||||
|
# }
|
||||||
|
# ),
|
||||||
)
|
)
|
||||||
# Save markdown to file
|
|
||||||
with open(os.path.join(__location__, "mexico_places.md"), "w") as f:
|
if result.success:
|
||||||
f.write(result.fit_markdown)
|
print(len(result.markdown_v2.raw_markdown))
|
||||||
|
print(len(result.markdown_v2.markdown_with_citations))
|
||||||
|
print(len(result.markdown_v2.fit_markdown))
|
||||||
|
|
||||||
|
# Save clean html
|
||||||
|
with open(os.path.join(__location__, "output/cleaned_html.html"), "w") as f:
|
||||||
|
f.write(result.cleaned_html)
|
||||||
|
|
||||||
|
with open(os.path.join(__location__, "output/output_raw_markdown.md"), "w") as f:
|
||||||
|
f.write(result.markdown_v2.raw_markdown)
|
||||||
|
|
||||||
|
with open(os.path.join(__location__, "output/output_markdown_with_citations.md"), "w") as f:
|
||||||
|
f.write(result.markdown_v2.markdown_with_citations)
|
||||||
|
|
||||||
|
with open(os.path.join(__location__, "output/output_fit_markdown.md"), "w") as f:
|
||||||
|
f.write(result.markdown_v2.fit_markdown)
|
||||||
|
|
||||||
print("Done")
|
print("Done")
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user