merge next. Resolve conflicts. Fix some import errors and error handling in server.py
This commit is contained in:
@@ -47,6 +47,7 @@ from .utils import (
|
||||
create_box_message,
|
||||
get_error_context,
|
||||
RobotsParser,
|
||||
preprocess_html_for_schema,
|
||||
)
|
||||
|
||||
|
||||
@@ -111,7 +112,8 @@ class AsyncWebCrawler:
|
||||
self,
|
||||
crawler_strategy: AsyncCrawlerStrategy = None,
|
||||
config: BrowserConfig = None,
|
||||
base_directory: str = str(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())),
|
||||
base_directory: str = str(
|
||||
os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())),
|
||||
thread_safe: bool = False,
|
||||
logger: AsyncLoggerBase = None,
|
||||
**kwargs,
|
||||
@@ -139,7 +141,8 @@ class AsyncWebCrawler:
|
||||
)
|
||||
|
||||
# Initialize crawler strategy
|
||||
params = {k: v for k, v in kwargs.items() if k in ["browser_config", "logger"]}
|
||||
params = {k: v for k, v in kwargs.items() if k in [
|
||||
"browser_config", "logger"]}
|
||||
self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy(
|
||||
browser_config=browser_config,
|
||||
logger=self.logger,
|
||||
@@ -237,7 +240,8 @@ class AsyncWebCrawler:
|
||||
|
||||
config = config or CrawlerRunConfig()
|
||||
if not isinstance(url, str) or not url:
|
||||
raise ValueError("Invalid URL, make sure the URL is a non-empty string")
|
||||
raise ValueError(
|
||||
"Invalid URL, make sure the URL is a non-empty string")
|
||||
|
||||
async with self._lock or self.nullcontext():
|
||||
try:
|
||||
@@ -291,12 +295,12 @@ class AsyncWebCrawler:
|
||||
|
||||
# Update proxy configuration from rotation strategy if available
|
||||
if config and config.proxy_rotation_strategy:
|
||||
next_proxy : ProxyConfig = await config.proxy_rotation_strategy.get_next_proxy()
|
||||
next_proxy: ProxyConfig = await config.proxy_rotation_strategy.get_next_proxy()
|
||||
if next_proxy:
|
||||
self.logger.info(
|
||||
message="Switch proxy: {proxy}",
|
||||
tag="PROXY",
|
||||
params={"proxy": next_proxy.server}
|
||||
params={"proxy": next_proxy.server}
|
||||
)
|
||||
config.proxy_config = next_proxy
|
||||
# config = config.clone(proxy_config=next_proxy)
|
||||
@@ -306,7 +310,8 @@ class AsyncWebCrawler:
|
||||
t1 = time.perf_counter()
|
||||
|
||||
if config.user_agent:
|
||||
self.crawler_strategy.update_user_agent(config.user_agent)
|
||||
self.crawler_strategy.update_user_agent(
|
||||
config.user_agent)
|
||||
|
||||
# Check robots.txt if enabled
|
||||
if config and config.check_robots_txt:
|
||||
@@ -373,7 +378,8 @@ class AsyncWebCrawler:
|
||||
crawl_result.console_messages = async_response.console_messages
|
||||
|
||||
crawl_result.success = bool(html)
|
||||
crawl_result.session_id = getattr(config, "session_id", None)
|
||||
crawl_result.session_id = getattr(
|
||||
config, "session_id", None)
|
||||
|
||||
self.logger.url_status(
|
||||
url=cache_context.display_url,
|
||||
@@ -396,7 +402,8 @@ class AsyncWebCrawler:
|
||||
tag="COMPLETE"
|
||||
)
|
||||
cached_result.success = bool(html)
|
||||
cached_result.session_id = getattr(config, "session_id", None)
|
||||
cached_result.session_id = getattr(
|
||||
config, "session_id", None)
|
||||
cached_result.redirected_url = cached_result.redirected_url or url
|
||||
return CrawlResultContainer(cached_result)
|
||||
|
||||
@@ -463,12 +470,14 @@ class AsyncWebCrawler:
|
||||
params = config.__dict__.copy()
|
||||
params.pop("url", None)
|
||||
# add keys from kwargs to params that doesn't exist in params
|
||||
params.update({k: v for k, v in kwargs.items() if k not in params.keys()})
|
||||
params.update({k: v for k, v in kwargs.items()
|
||||
if k not in params.keys()})
|
||||
|
||||
################################
|
||||
# Scraping Strategy Execution #
|
||||
################################
|
||||
result: ScrapingResult = scraping_strategy.scrap(url, html, **params)
|
||||
result: ScrapingResult = scraping_strategy.scrap(
|
||||
url, html, **params)
|
||||
|
||||
if result is None:
|
||||
raise ValueError(
|
||||
@@ -484,7 +493,8 @@ class AsyncWebCrawler:
|
||||
|
||||
# Extract results - handle both dict and ScrapingResult
|
||||
if isinstance(result, dict):
|
||||
cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
|
||||
cleaned_html = sanitize_input_encode(
|
||||
result.get("cleaned_html", ""))
|
||||
media = result.get("media", {})
|
||||
links = result.get("links", {})
|
||||
metadata = result.get("metadata", {})
|
||||
@@ -501,14 +511,49 @@ class AsyncWebCrawler:
|
||||
config.markdown_generator or DefaultMarkdownGenerator()
|
||||
)
|
||||
|
||||
# --- SELECT HTML SOURCE BASED ON CONTENT_SOURCE ---
|
||||
# Get the desired source from the generator config, default to 'cleaned_html'
|
||||
selected_html_source = getattr(markdown_generator, 'content_source', 'cleaned_html')
|
||||
|
||||
# Define the source selection logic using dict dispatch
|
||||
html_source_selector = {
|
||||
"raw_html": lambda: html, # The original raw HTML
|
||||
"cleaned_html": lambda: cleaned_html, # The HTML after scraping strategy
|
||||
"fit_html": lambda: preprocess_html_for_schema(html_content=html), # Preprocessed raw HTML
|
||||
}
|
||||
|
||||
markdown_input_html = cleaned_html # Default to cleaned_html
|
||||
|
||||
try:
|
||||
# Get the appropriate lambda function, default to returning cleaned_html if key not found
|
||||
source_lambda = html_source_selector.get(selected_html_source, lambda: cleaned_html)
|
||||
# Execute the lambda to get the selected HTML
|
||||
markdown_input_html = source_lambda()
|
||||
|
||||
# Log which source is being used (optional, but helpful for debugging)
|
||||
# if self.logger and verbose:
|
||||
# actual_source_used = selected_html_source if selected_html_source in html_source_selector else 'cleaned_html (default)'
|
||||
# self.logger.debug(f"Using '{actual_source_used}' as source for Markdown generation for {url}", tag="MARKDOWN_SRC")
|
||||
|
||||
except Exception as e:
|
||||
# Handle potential errors, especially from preprocess_html_for_schema
|
||||
if self.logger:
|
||||
self.logger.warning(
|
||||
f"Error getting/processing '{selected_html_source}' for markdown source: {e}. Falling back to cleaned_html.",
|
||||
tag="MARKDOWN_SRC"
|
||||
)
|
||||
# Ensure markdown_input_html is still the default cleaned_html in case of error
|
||||
markdown_input_html = cleaned_html
|
||||
# --- END: HTML SOURCE SELECTION ---
|
||||
|
||||
# Uncomment if by default we want to use PruningContentFilter
|
||||
# if not config.content_filter and not markdown_generator.content_filter:
|
||||
# markdown_generator.content_filter = PruningContentFilter()
|
||||
|
||||
markdown_result: MarkdownGenerationResult = (
|
||||
markdown_generator.generate_markdown(
|
||||
cleaned_html=cleaned_html,
|
||||
base_url=params.get("redirected_url", url),
|
||||
input_html=markdown_input_html,
|
||||
base_url=params.get("redirected_url", url)
|
||||
# html2text_options=kwargs.get('html2text', {})
|
||||
)
|
||||
)
|
||||
|
||||
@@ -31,22 +31,24 @@ class MarkdownGenerationStrategy(ABC):
|
||||
content_filter: Optional[RelevantContentFilter] = None,
|
||||
options: Optional[Dict[str, Any]] = None,
|
||||
verbose: bool = False,
|
||||
content_source: str = "cleaned_html",
|
||||
):
|
||||
self.content_filter = content_filter
|
||||
self.options = options or {}
|
||||
self.verbose = verbose
|
||||
self.content_source = content_source
|
||||
|
||||
@abstractmethod
|
||||
def generate_markdown(
|
||||
self,
|
||||
cleaned_html: str,
|
||||
input_html: str,
|
||||
base_url: str = "",
|
||||
html2text_options: Optional[Dict[str, Any]] = None,
|
||||
content_filter: Optional[RelevantContentFilter] = None,
|
||||
citations: bool = True,
|
||||
**kwargs,
|
||||
) -> MarkdownGenerationResult:
|
||||
"""Generate markdown from cleaned HTML."""
|
||||
"""Generate markdown from the selected input HTML."""
|
||||
pass
|
||||
|
||||
|
||||
@@ -63,6 +65,7 @@ class DefaultMarkdownGenerator(MarkdownGenerationStrategy):
|
||||
Args:
|
||||
content_filter (Optional[RelevantContentFilter]): Content filter for generating fit markdown.
|
||||
options (Optional[Dict[str, Any]]): Additional options for markdown generation. Defaults to None.
|
||||
content_source (str): Source of content to generate markdown from. Options: "cleaned_html", "raw_html", "fit_html". Defaults to "cleaned_html".
|
||||
|
||||
Returns:
|
||||
MarkdownGenerationResult: Result containing raw markdown, fit markdown, fit HTML, and references markdown.
|
||||
@@ -72,8 +75,9 @@ class DefaultMarkdownGenerator(MarkdownGenerationStrategy):
|
||||
self,
|
||||
content_filter: Optional[RelevantContentFilter] = None,
|
||||
options: Optional[Dict[str, Any]] = None,
|
||||
content_source: str = "cleaned_html",
|
||||
):
|
||||
super().__init__(content_filter, options)
|
||||
super().__init__(content_filter, options, verbose=False, content_source=content_source)
|
||||
|
||||
def convert_links_to_citations(
|
||||
self, markdown: str, base_url: str = ""
|
||||
@@ -143,7 +147,7 @@ class DefaultMarkdownGenerator(MarkdownGenerationStrategy):
|
||||
|
||||
def generate_markdown(
|
||||
self,
|
||||
cleaned_html: str,
|
||||
input_html: str,
|
||||
base_url: str = "",
|
||||
html2text_options: Optional[Dict[str, Any]] = None,
|
||||
options: Optional[Dict[str, Any]] = None,
|
||||
@@ -152,16 +156,16 @@ class DefaultMarkdownGenerator(MarkdownGenerationStrategy):
|
||||
**kwargs,
|
||||
) -> MarkdownGenerationResult:
|
||||
"""
|
||||
Generate markdown with citations from cleaned HTML.
|
||||
Generate markdown with citations from the provided input HTML.
|
||||
|
||||
How it works:
|
||||
1. Generate raw markdown from cleaned HTML.
|
||||
1. Generate raw markdown from the input HTML.
|
||||
2. Convert links to citations.
|
||||
3. Generate fit markdown if content filter is provided.
|
||||
4. Return MarkdownGenerationResult.
|
||||
|
||||
Args:
|
||||
cleaned_html (str): Cleaned HTML content.
|
||||
input_html (str): The HTML content to process (selected based on content_source).
|
||||
base_url (str): Base URL for URL joins.
|
||||
html2text_options (Optional[Dict[str, Any]]): HTML2Text options.
|
||||
options (Optional[Dict[str, Any]]): Additional options for markdown generation.
|
||||
@@ -196,14 +200,14 @@ class DefaultMarkdownGenerator(MarkdownGenerationStrategy):
|
||||
h.update_params(**default_options)
|
||||
|
||||
# Ensure we have valid input
|
||||
if not cleaned_html:
|
||||
cleaned_html = ""
|
||||
elif not isinstance(cleaned_html, str):
|
||||
cleaned_html = str(cleaned_html)
|
||||
if not input_html:
|
||||
input_html = ""
|
||||
elif not isinstance(input_html, str):
|
||||
input_html = str(input_html)
|
||||
|
||||
# Generate raw markdown
|
||||
try:
|
||||
raw_markdown = h.handle(cleaned_html)
|
||||
raw_markdown = h.handle(input_html)
|
||||
except Exception as e:
|
||||
raw_markdown = f"Error converting HTML to markdown: {str(e)}"
|
||||
|
||||
@@ -228,7 +232,7 @@ class DefaultMarkdownGenerator(MarkdownGenerationStrategy):
|
||||
if content_filter or self.content_filter:
|
||||
try:
|
||||
content_filter = content_filter or self.content_filter
|
||||
filtered_html = content_filter.filter_content(cleaned_html)
|
||||
filtered_html = content_filter.filter_content(input_html)
|
||||
filtered_html = "\n".join(
|
||||
"<div>{}</div>".format(s) for s in filtered_html
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user