Merge branch '2025-MAY-2' into next-MAY

Author: ntohidi
Date: 2025-07-08 11:46:13 +02:00
28 changed files with 448 additions and 154 deletions

View File

@@ -291,11 +291,19 @@ import requests
 # Submit a crawl job
 response = requests.post(
     "http://localhost:11235/crawl",
-    json={"urls": "https://example.com", "priority": 10}
+    json={"urls": ["https://example.com"], "priority": 10}
 )
-task_id = response.json()["task_id"]
-
-# Continue polling until the task is complete (status="completed")
+if response.status_code == 200:
+    print("Crawl job submitted successfully.")
+    if "results" in response.json():
+        results = response.json()["results"]
+        print("Crawl job completed. Results:")
+        for result in results:
+            print(result)
+    else:
+        task_id = response.json()["task_id"]
+        print(f"Crawl job submitted. Task ID: {task_id}")
         result = requests.get(f"http://localhost:11235/task/{task_id}")
 ```
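
The old snippet's removed comment already hinted at polling until `status="completed"`. A minimal polling loop against the task endpoint might look like this (a sketch; the exact response field names are assumptions):

```python
import time
import requests

response = requests.post(
    "http://localhost:11235/crawl",
    json={"urls": ["https://example.com"], "priority": 10},
)
task_id = response.json()["task_id"]

# Poll until the server reports completion ("status" field assumed
# from the removed comment above).
while True:
    task = requests.get(f"http://localhost:11235/task/{task_id}").json()
    if task.get("status") == "completed":
        print(task.get("result"))
        break
    time.sleep(1)
```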

View File

@@ -445,6 +445,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
             return await self._crawl_web(url, config)

         elif url.startswith("file://"):
+            # Initialize empty list for console messages
+            captured_console = []
             # Process local file
             local_file_path = url[7:]  # Remove 'file://' prefix

             if not os.path.exists(local_file_path):
@@ -741,18 +744,49 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                 )
                 redirected_url = page.url
             except Error as e:
-                raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}")
+                # Allow navigation to be aborted when downloading files.
+                # This is expected behavior for downloads in some browser engines.
+                if 'net::ERR_ABORTED' in str(e) and self.browser_config.accept_downloads:
+                    self.logger.info(
+                        message=f"Navigation aborted, likely due to file download: {url}",
+                        tag="GOTO",
+                        params={"url": url},
+                    )
+                    response = None
+                else:
+                    raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}")

             await self.execute_hook(
                 "after_goto", page, context=context, url=url, response=response, config=config
             )

+            # ──────────────────────────────────────────────────────────────
+            # Walk the redirect chain. Playwright returns only the last
+            # hop, so we trace the `request.redirected_from` links until the
+            # first response that differs from the final one and surface its
+            # status code.
+            # ──────────────────────────────────────────────────────────────
             if response is None:
                 status_code = 200
                 response_headers = {}
             else:
-                status_code = response.status
-                response_headers = response.headers
+                first_resp = response
+                req = response.request
+                while req and req.redirected_from:
+                    prev_req = req.redirected_from
+                    prev_resp = await prev_req.response()
+                    if prev_resp:  # keep earliest
+                        first_resp = prev_resp
+                    req = prev_req
+                status_code = first_resp.status
+                response_headers = first_resp.headers
+                # if response is None:
+                #     status_code = 200
+                #     response_headers = {}
+                # else:
+                #     status_code = response.status
+                #     response_headers = response.headers

         else:
             status_code = 200
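
The redirect-chain walk above can be exercised in isolation with plain Playwright; a sketch (the URL is illustrative, `redirected_from` and `response()` are public Playwright APIs):

```python
import asyncio
from playwright.async_api import async_playwright

async def first_hop_status(url: str) -> int:
    """Status code of the first response in a redirect chain."""
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()
        response = await page.goto(url)
        first_resp, req = response, response.request
        # goto() returns only the last hop; walk back through
        # redirected_from to reach the earliest response.
        while req and req.redirected_from:
            prev_req = req.redirected_from
            prev_resp = await prev_req.response()
            if prev_resp:
                first_resp = prev_resp
            req = prev_req
        status = first_resp.status
        await browser.close()
        return status

print(asyncio.run(first_hop_status("http://github.com")))  # e.g. 301
```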
@@ -1616,12 +1650,32 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                 num_segments = (page_height // viewport_height) + 1
                 for i in range(num_segments):
                     y_offset = i * viewport_height
+                    # Special handling for the last segment
+                    if i == num_segments - 1:
+                        last_part_height = page_height % viewport_height
+                        # If page_height is an exact multiple of viewport_height,
+                        # we don't need an extra segment
+                        if last_part_height == 0:
+                            break
+                        # Adjust viewport to exactly match the remaining content height
+                        await page.set_viewport_size({"width": page_width, "height": last_part_height})
                     await page.evaluate(f"window.scrollTo(0, {y_offset})")
                     await asyncio.sleep(0.01)  # wait for render
-                    seg_shot = await page.screenshot(full_page=False)
+                    # Capture the current segment as JPEG to reduce memory usage
+                    seg_shot = await page.screenshot(full_page=False, type="jpeg", quality=85)
+                    # seg_shot = await page.screenshot(full_page=False)
                     img = Image.open(BytesIO(seg_shot)).convert("RGB")
                     segments.append(img)

+                # Reset viewport to original size after capturing segments
+                await page.set_viewport_size({"width": page_width, "height": viewport_height})

                 total_height = sum(img.height for img in segments)
                 stitched = Image.new("RGB", (segments[0].width, total_height))
                 offset = 0

View File

@@ -39,6 +39,7 @@ class LogColor(str, Enum):
     YELLOW = "yellow"
     MAGENTA = "magenta"
     DIM_MAGENTA = "dim magenta"
+    RED = "red"

     def __str__(self):
         """Automatically convert rich color to string."""

View File

@@ -588,10 +588,12 @@ class AsyncWebCrawler:
                 # Choose content based on input_format
                 content_format = config.extraction_strategy.input_format
                 if content_format == "fit_markdown" and not markdown_result.fit_markdown:
-                    self.logger.warning(
-                        message="Fit markdown requested but not available. Falling back to raw markdown.",
+                    self.logger.url_status(
+                        url=_url,
+                        success=bool(html),
+                        timing=time.perf_counter() - t1,
                         tag="EXTRACT",
-                        params={"url": _url},
                     )
                     content_format = "markdown"
@@ -616,10 +618,11 @@ class AsyncWebCrawler:
                 )

                 # Log extraction completion
-                self.logger.info(
-                    message="Completed for {url:.50}... | Time: {timing}s",
+                self.logger.url_status(
+                    url=_url,
+                    success=bool(html),
+                    timing=time.perf_counter() - t1,
                     tag="EXTRACT",
-                    params={"url": _url, "timing": time.perf_counter() - t1},
                 )

             # Apply HTML formatting if requested

View File

@@ -480,7 +480,7 @@ class BrowserProfiler:
self.logger.info("4. Exit", tag="MENU", base_color=LogColor.MAGENTA) self.logger.info("4. Exit", tag="MENU", base_color=LogColor.MAGENTA)
exit_option = "4" exit_option = "4"
self.logger.print(f"\n[cyan]Enter your choice (1-{exit_option}): [/cyan]", end="") self.logger.info(f"\n[cyan]Enter your choice (1-{exit_option}): [/cyan]", end="")
choice = input() choice = input()
if choice == "1": if choice == "1":
@@ -637,9 +637,18 @@ class BrowserProfiler:
self.logger.info(f"Debugging port: {debugging_port}", tag="CDP") self.logger.info(f"Debugging port: {debugging_port}", tag="CDP")
self.logger.info(f"Headless mode: {headless}", tag="CDP") self.logger.info(f"Headless mode: {headless}", tag="CDP")
# create browser config
browser_config = BrowserConfig(
browser_type=browser_type,
headless=headless,
user_data_dir=profile_path,
debugging_port=debugging_port,
verbose=True
)
# Create managed browser instance # Create managed browser instance
managed_browser = ManagedBrowser( managed_browser = ManagedBrowser(
browser_type=browser_type, browser_config=browser_config,
user_data_dir=profile_path, user_data_dir=profile_path,
headless=headless, headless=headless,
logger=self.logger, logger=self.logger,

View File

@@ -1010,7 +1010,7 @@ def cdp_cmd(user_data_dir: Optional[str], port: int, browser_type: str, headless
@click.option("--crawler", "-c", type=str, callback=parse_key_values, help="Crawler parameters as key1=value1,key2=value2") @click.option("--crawler", "-c", type=str, callback=parse_key_values, help="Crawler parameters as key1=value1,key2=value2")
@click.option("--output", "-o", type=click.Choice(["all", "json", "markdown", "md", "markdown-fit", "md-fit"]), default="all") @click.option("--output", "-o", type=click.Choice(["all", "json", "markdown", "md", "markdown-fit", "md-fit"]), default="all")
@click.option("--output-file", "-O", type=click.Path(), help="Output file path (default: stdout)") @click.option("--output-file", "-O", type=click.Path(), help="Output file path (default: stdout)")
@click.option("--bypass-cache", "-b", is_flag=True, default=True, help="Bypass cache when crawling") @click.option("--bypass-cache", "-bc", is_flag=True, default=True, help="Bypass cache when crawling")
@click.option("--question", "-q", help="Ask a question about the crawled content") @click.option("--question", "-q", help="Ask a question about the crawled content")
@click.option("--verbose", "-v", is_flag=True) @click.option("--verbose", "-v", is_flag=True)
@click.option("--profile", "-p", help="Use a specific browser profile (by name)") @click.option("--profile", "-p", help="Use a specific browser profile (by name)")

View File

@@ -720,13 +720,18 @@ class WebScrapingStrategy(ContentScrapingStrategy):
                 # Check flag if we should remove external images
                 if kwargs.get("exclude_external_images", False):
+                    # Handle relative URLs (which are always from the same domain)
+                    if not src.startswith('http') and not src.startswith('//'):
+                        return True  # Keep relative URLs
+                    # For absolute URLs, compare the base domains using the existing function
+                    src_base_domain = get_base_domain(src)
+                    url_base_domain = get_base_domain(url)
+                    # If the domains don't match and both are valid, the image is external
+                    if src_base_domain and url_base_domain and src_base_domain != url_base_domain:
                         element.decompose()
                         return False
+                    # src_url_base = src.split('/')[2]
+                    # url_base = url.split('/')[2]
+                    # if url_base not in src_url_base:
+                    #     element.decompose()
+                    #     return False

                 # if kwargs.get('exclude_social_media_links', False):
                 #     if image_src_base_domain in exclude_social_media_domains:
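
`get_base_domain` here is the library's own helper; a rough standalone equivalent of the comparison, sketched with `urllib.parse` (naive about public suffixes, unlike the real function):

```python
from urllib.parse import urlparse

def naive_base_domain(url: str) -> str:
    # Last two labels of the hostname; a real implementation
    # (like crawl4ai's get_base_domain) accounts for public suffixes.
    host = urlparse(url).netloc.split(":")[0]
    return ".".join(host.split(".")[-2:]) if host else ""

def is_external_image(src: str, page_url: str) -> bool:
    if not src.startswith("http") and not src.startswith("//"):
        return False  # relative URLs always stay on the same domain
    src_domain = naive_base_domain(src)
    page_domain = naive_base_domain(page_url)
    return bool(src_domain and page_domain and src_domain != page_domain)

print(is_external_image("https://cdn.ads.org/a.png", "https://example.com/post"))  # True
print(is_external_image("/static/logo.png", "https://example.com/post"))           # False
```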

View File

@@ -150,6 +150,14 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping crawl") self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping crawl")
break break
# Calculate how many more URLs we can process in this batch
remaining = self.max_pages - self._pages_crawled
batch_size = min(BATCH_SIZE, remaining)
if batch_size <= 0:
# No more pages to crawl
self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping crawl")
break
batch: List[Tuple[float, int, str, Optional[str]]] = [] batch: List[Tuple[float, int, str, Optional[str]]] = []
# Retrieve up to BATCH_SIZE items from the priority queue. # Retrieve up to BATCH_SIZE items from the priority queue.
for _ in range(BATCH_SIZE): for _ in range(BATCH_SIZE):
@@ -184,6 +192,10 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
                     # Count only successful crawls toward max_pages limit
                     if result.success:
                         self._pages_crawled += 1
+                        # Check if we've reached the limit during batch processing
+                        if self._pages_crawled >= self.max_pages:
+                            self.logger.info(f"Max pages limit ({self.max_pages}) reached during batch, stopping crawl")
+                            break  # Exit the generator
                     yield result
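
The same `max_pages` guard recurs in the BFS and DFS strategies below. Reduced to its essence, the pattern is a counting generator (a sketch, not the library's code; note that, as in the diff, the result that trips the limit is counted but not yielded):

```python
def limit_pages(results, max_pages: int):
    """Yield crawl results, stopping once max_pages successes are seen."""
    pages_crawled = 0
    for result in results:
        if result.success:
            pages_crawled += 1
            if pages_crawled >= max_pages:
                break  # exit the generator; this result is not yielded
        yield result
```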

View File

@@ -157,6 +157,11 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
         results: List[CrawlResult] = []

         while current_level and not self._cancel_event.is_set():
+            # Check if we've already reached max_pages before starting a new level
+            if self._pages_crawled >= self.max_pages:
+                self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping crawl")
+                break

             next_level: List[Tuple[str, Optional[str]]] = []
             urls = [url for url, _ in current_level]
@@ -221,6 +226,10 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
                 # Count only successful crawls
                 if result.success:
                     self._pages_crawled += 1
+                    # Check if we've reached the limit during batch processing
+                    if self._pages_crawled >= self.max_pages:
+                        self.logger.info(f"Max pages limit ({self.max_pages}) reached during batch, stopping crawl")
+                        break  # Exit the generator
                 results_count += 1
                 yield result

View File

@@ -49,6 +49,10 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
                 # Count only successful crawls toward max_pages limit
                 if result.success:
                     self._pages_crawled += 1
+                    # Check if we've reached the limit during batch processing
+                    if self._pages_crawled >= self.max_pages:
+                        self.logger.info(f"Max pages limit ({self.max_pages}) reached during batch, stopping crawl")
+                        break  # Exit the generator

                     # Only discover links from successful crawls
                     new_links: List[Tuple[str, Optional[str]]] = []
@@ -94,6 +98,10 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
                 # and only discover links from successful crawls
                 if result.success:
                     self._pages_crawled += 1
+                    # Check if we've reached the limit during batch processing
+                    if self._pages_crawled >= self.max_pages:
+                        self.logger.info(f"Max pages limit ({self.max_pages}) reached during batch, stopping crawl")
+                        break  # Exit the generator

                     new_links: List[Tuple[str, Optional[str]]] = []
                     await self.link_discovery(result, url, depth, visited, new_links, depths)

View File

@@ -73,6 +73,8 @@ class Crawl4aiDockerClient:
     def _prepare_request(self, urls: List[str], browser_config: Optional[BrowserConfig] = None,
                          crawler_config: Optional[CrawlerRunConfig] = None) -> Dict[str, Any]:
         """Prepare request data from configs."""
+        if self._token:
+            self._http_client.headers["Authorization"] = f"Bearer {self._token}"
         return {
             "urls": urls,
             "browser_config": browser_config.dump() if browser_config else {},
@@ -103,8 +105,6 @@ class Crawl4aiDockerClient:
         crawler_config: Optional[CrawlerRunConfig] = None
     ) -> Union[CrawlResult, List[CrawlResult], AsyncGenerator[CrawlResult, None]]:
         """Execute a crawl operation."""
-        if not self._token:
-            raise Crawl4aiClientError("Authentication required. Call authenticate() first.")
         await self._check_server()

         data = self._prepare_request(urls, browser_config, crawler_config)
@@ -140,8 +140,6 @@ class Crawl4aiDockerClient:
     async def get_schema(self) -> Dict[str, Any]:
         """Retrieve configuration schemas."""
-        if not self._token:
-            raise Crawl4aiClientError("Authentication required. Call authenticate() first.")
         response = await self._request("GET", "/schema")
         return response.json()
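
With the hard authentication checks removed, the token is now attached opportunistically. The idea in isolation (a sketch using `httpx`; class and method names are illustrative):

```python
import httpx
from typing import Optional

class MiniClient:
    def __init__(self, base_url: str, token: Optional[str] = None):
        self._token = token
        self._http_client = httpx.AsyncClient(base_url=base_url)

    def _apply_auth(self) -> None:
        # Set the Authorization header only when a token exists, so
        # unauthenticated servers keep working without authenticate().
        if self._token:
            self._http_client.headers["Authorization"] = f"Bearer {self._token}"
```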

View File

@@ -656,11 +656,11 @@ class LLMExtractionStrategy(ExtractionStrategy):
                 self.total_usage.total_tokens += usage.total_tokens

                 try:
-                    response = response.choices[0].message.content
+                    content = response.choices[0].message.content
                     blocks = None
                     if self.force_json_response:
-                        blocks = json.loads(response)
+                        blocks = json.loads(content)
                         if isinstance(blocks, dict):
                             # If it has only one key whose value is a list, assign that to blocks, e.g. {"news": [...]}
                             if len(blocks) == 1 and isinstance(list(blocks.values())[0], list):
@@ -673,7 +673,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
                             blocks = blocks
                     else:
                         # blocks = extract_xml_data(["blocks"], response.choices[0].message.content)["blocks"]
-                        blocks = extract_xml_data(["blocks"], response)["blocks"]
+                        blocks = extract_xml_data(["blocks"], content)["blocks"]
                         blocks = json.loads(blocks)

                     for block in blocks:
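
The rename from `response` to `content` stops the code from clobbering the LLM response object it still needs. The single-key unwrapping it feeds into, sketched standalone:

```python
import json

def unwrap_blocks(content: str):
    """If the model returns {"news": [...]}, surface the inner list."""
    blocks = json.loads(content)
    if isinstance(blocks, dict) and len(blocks) == 1:
        (value,) = blocks.values()
        if isinstance(value, list):
            return value
    return blocks

print(unwrap_blocks('{"news": [{"title": "a"}, {"title": "b"}]}'))
# [{'title': 'a'}, {'title': 'b'}]
```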

View File

@@ -50,6 +50,29 @@ from urllib.parse import (
 )

+# Monkey patch to fix wildcard handling in urllib.robotparser
+from urllib.robotparser import RuleLine
+import re
+
+original_applies_to = RuleLine.applies_to
+
+def patched_applies_to(self, filename):
+    # Handle wildcards in paths
+    if '*' in self.path or '%2A' in self.path or self.path in ("*", "%2A"):
+        pattern = self.path.replace('%2A', '*')
+        pattern = re.escape(pattern).replace('\\*', '.*')
+        pattern = '^' + pattern
+        if pattern.endswith('\\$'):
+            pattern = pattern[:-2] + '$'
+        try:
+            return bool(re.match(pattern, filename))
+        except re.error:
+            return original_applies_to(self, filename)
+    return original_applies_to(self, filename)
+
+RuleLine.applies_to = patched_applies_to
+# Monkey patch ends

 def chunk_documents(
     documents: Iterable[str],
     chunk_token_threshold: int,
@@ -318,7 +341,7 @@ class RobotsParser:
             robots_url = f"{scheme}://{domain}/robots.txt"

             async with aiohttp.ClientSession() as session:
-                async with session.get(robots_url, timeout=2) as response:
+                async with session.get(robots_url, timeout=2, ssl=False) as response:
                     if response.status == 200:
                         rules = await response.text()
                         self._cache_rules(domain, rules)
@@ -1524,6 +1547,13 @@ def extract_metadata_using_lxml(html, doc=None):
             content = tag.get("content", "").strip()
             if property_name and content:
                 metadata[property_name] = content

+        # Article metadata - using starts-with() for performance
+        article_tags = head.xpath('.//meta[starts-with(@property, "article:")]')
+        for tag in article_tags:
+            property_name = tag.get("property", "").strip()
+            content = tag.get("content", "").strip()
+            if property_name and content:
+                metadata[property_name] = content

     return metadata
@@ -1599,7 +1629,12 @@ def extract_metadata(html, soup=None):
             content = tag.get("content", "").strip()
             if property_name and content:
                 metadata[property_name] = content

+    # Collect article:* metadata values
+    metadata.update({
+        tag['property'].strip(): tag["content"].strip()
+        for tag in head.find_all("meta", attrs={"property": re.compile(r"^article:")})
+        if tag.has_attr('property') and tag.has_attr('content')
+    })

     return metadata
@@ -2069,13 +2104,15 @@ def normalize_url(href, base_url):
     if not parsed_base.scheme or not parsed_base.netloc:
         raise ValueError(f"Invalid base URL format: {base_url}")

-    # Ensure base_url ends with a trailing slash if it's a directory path
-    if not base_url.endswith('/'):
-        base_url = base_url + '/'
+    if parsed_base.scheme.lower() not in ["http", "https"]:
+        # Reject special (non-http/https) protocols
+        raise ValueError(f"Invalid base URL format: {base_url}")
+
+    cleaned_href = href.strip()

     # Use urljoin to handle all cases
-    normalized = urljoin(base_url, href.strip())
-    return normalized
+    return urljoin(base_url, cleaned_href)

 def normalize_url(
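
The robots.txt monkey patch above converts wildcard rule paths into anchored regexes. The transform can be checked on its own (same logic as the patch, minus the patching):

```python
import re

def wildcard_applies_to(rule_path: str, filename: str) -> bool:
    # Un-escape %2A, escape regex metacharacters, then turn '*'
    # into '.*' and anchor the pattern at the start of the path.
    pattern = rule_path.replace("%2A", "*")
    pattern = re.escape(pattern).replace("\\*", ".*")
    pattern = "^" + pattern
    if pattern.endswith("\\$"):
        pattern = pattern[:-2] + "$"
    return bool(re.match(pattern, filename))

print(wildcard_applies_to("/private*/", "/private-data/page.html"))  # True
print(wildcard_applies_to("/*.pdf$", "/docs/manual.pdf"))            # True
print(wildcard_applies_to("/*.pdf$", "/docs/manual.pdf?x=1"))        # False
```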

View File

@@ -459,7 +459,7 @@ async def handle_crawl_request(
         #     await crawler.close()
         # except Exception as close_e:
         #     logger.error(f"Error closing crawler during exception handling: {close_e}")
-        logger.error(f"Error closing crawler during exception handling: {close_e}")
+        logger.error(f"Error closing crawler during exception handling: {str(e)}")

         # Measure memory even on error if possible
         end_mem_mb_error = _get_memory_mb()
@@ -518,7 +518,7 @@ async def handle_stream_crawl_request(
         #     await crawler.close()
         # except Exception as close_e:
         #     logger.error(f"Error closing crawler during stream setup exception: {close_e}")
-        logger.error(f"Error closing crawler during stream setup exception: {close_e}")
+        logger.error(f"Error closing crawler during stream setup exception: {str(e)}")
         logger.error(f"Stream crawl error: {str(e)}", exc_info=True)
         # Raising HTTPException here will prevent streaming response
         raise HTTPException(

View File

@@ -403,7 +403,7 @@ async def main():
     md_generator = DefaultMarkdownGenerator(
         content_filter=filter,
-        options={"ignore_links": True}
+        options={"ignore_links": True})

     # 4) Crawler run config: skip cache, use extraction
     run_conf = CrawlerRunConfig(
@@ -3760,11 +3760,11 @@ To crawl a live web page, provide the URL starting with `http://` or `https://`,
 ```python
 import asyncio
-from crawl4ai import AsyncWebCrawler
+from crawl4ai import AsyncWebCrawler, CacheMode
 from crawl4ai.async_configs import CrawlerRunConfig

 async def crawl_web():
-    config = CrawlerRunConfig(bypass_cache=True)
+    config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
     async with AsyncWebCrawler() as crawler:
         result = await crawler.arun(
             url="https://en.wikipedia.org/wiki/apple",
@@ -3785,13 +3785,13 @@ To crawl a local HTML file, prefix the file path with `file://`.
 ```python
 import asyncio
-from crawl4ai import AsyncWebCrawler
+from crawl4ai import AsyncWebCrawler, CacheMode
 from crawl4ai.async_configs import CrawlerRunConfig

 async def crawl_local_file():
     local_file_path = "/path/to/apple.html"  # Replace with your file path
     file_url = f"file://{local_file_path}"
-    config = CrawlerRunConfig(bypass_cache=True)
+    config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)

     async with AsyncWebCrawler() as crawler:
         result = await crawler.arun(url=file_url, config=config)
@@ -3810,13 +3810,13 @@ To crawl raw HTML content, prefix the HTML string with `raw:`.
 ```python
 import asyncio
-from crawl4ai import AsyncWebCrawler
+from crawl4ai import AsyncWebCrawler, CacheMode
 from crawl4ai.async_configs import CrawlerRunConfig

 async def crawl_raw_html():
     raw_html = "<html><body><h1>Hello, World!</h1></body></html>"
     raw_html_url = f"raw:{raw_html}"
-    config = CrawlerRunConfig(bypass_cache=True)
+    config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)

     async with AsyncWebCrawler() as crawler:
         result = await crawler.arun(url=raw_html_url, config=config)
@@ -3845,7 +3845,7 @@ import os
 import sys
 import asyncio
 from pathlib import Path
-from crawl4ai import AsyncWebCrawler
+from crawl4ai import AsyncWebCrawler, CacheMode
 from crawl4ai.async_configs import CrawlerRunConfig

 async def main():
@@ -3856,7 +3856,7 @@ async def main():
     async with AsyncWebCrawler() as crawler:
         # Step 1: Crawl the Web URL
         print("\n=== Step 1: Crawling the Wikipedia URL ===")
-        web_config = CrawlerRunConfig(bypass_cache=True)
+        web_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
         result = await crawler.arun(url=wikipedia_url, config=web_config)

         if not result.success:
@@ -3871,7 +3871,7 @@ async def main():
         # Step 2: Crawl from the Local HTML File
         print("=== Step 2: Crawling from the Local HTML File ===")
         file_url = f"file://{html_file_path.resolve()}"
-        file_config = CrawlerRunConfig(bypass_cache=True)
+        file_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
         local_result = await crawler.arun(url=file_url, config=file_config)

         if not local_result.success:
@@ -3887,7 +3887,7 @@ async def main():
         with open(html_file_path, 'r', encoding='utf-8') as f:
             raw_html_content = f.read()
         raw_html_url = f"raw:{raw_html_content}"
-        raw_config = CrawlerRunConfig(bypass_cache=True)
+        raw_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
         raw_result = await crawler.arun(url=raw_html_url, config=raw_config)

         if not raw_result.success:
@@ -4152,7 +4152,7 @@ prune_filter = PruningContentFilter(
 For intelligent content filtering and high-quality markdown generation, you can use the **LLMContentFilter**. This filter leverages LLMs to generate relevant markdown while preserving the original content's meaning and structure:

 ```python
-from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig, DefaultMarkdownGenerator
 from crawl4ai.content_filter_strategy import LLMContentFilter

 async def main():
@@ -4175,8 +4175,13 @@ async def main():
         verbose=True
     )

+    md_generator = DefaultMarkdownGenerator(
+        content_filter=filter,
+        options={"ignore_links": True}
+    )
     config = CrawlerRunConfig(
-        content_filter=filter
+        markdown_generator=md_generator
     )

     async with AsyncWebCrawler() as crawler:
@@ -5428,29 +5433,38 @@ Sometimes you need a visual record of a page or a PDF “printout.” Crawl4AI c
 ```python
 import os, asyncio
 from base64 import b64decode
-from crawl4ai import AsyncWebCrawler, CacheMode
+from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig

 async def main():
+    run_config = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS,
+        screenshot=True,
+        pdf=True
+    )
     async with AsyncWebCrawler() as crawler:
         result = await crawler.arun(
             url="https://en.wikipedia.org/wiki/List_of_common_misconceptions",
-            cache_mode=CacheMode.BYPASS,
-            pdf=True,
-            screenshot=True
+            config=run_config
         )

         if result.success:
-            # Save screenshot
+            print(f"Screenshot data present: {result.screenshot is not None}")
+            print(f"PDF data present: {result.pdf is not None}")

             if result.screenshot:
+                print(f"[OK] Screenshot captured, size: {len(result.screenshot)} bytes")
                 with open("wikipedia_screenshot.png", "wb") as f:
                     f.write(b64decode(result.screenshot))
+            else:
+                print("[WARN] Screenshot data is None.")

-            # Save PDF
             if result.pdf:
+                print(f"[OK] PDF captured, size: {len(result.pdf)} bytes")
                 with open("wikipedia_page.pdf", "wb") as f:
                     f.write(result.pdf)
+            else:
+                print("[WARN] PDF data is None.")

-            print("[OK] PDF & screenshot captured.")
         else:
             print("[ERROR]", result.error_message)

View File

@@ -12,8 +12,7 @@ class CrawlRequest(BaseModel):
 class MarkdownRequest(BaseModel):
     """Request body for the /md endpoint."""
     url: str = Field(..., description="Absolute http/https URL to fetch")
-    f: FilterType = Field(FilterType.FIT,
-                          description="Content-filter strategy: FIT, RAW, BM25, or LLM")
+    f: FilterType = Field(FilterType.FIT, description="Content-filter strategy: fit, raw, bm25, or llm")
     q: Optional[str] = Field(None, description="Query string used by BM25/LLM filters")
     c: Optional[str] = Field("0", description="Cache-bust / revision counter")
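
For reference, a call against this model might look like the following (a sketch; the host and port are carried over from the crawl example earlier, and the response shape is an assumption):

```python
import requests

resp = requests.post(
    "http://localhost:11235/md",
    json={
        "url": "https://example.com",
        "f": "bm25",           # content-filter strategy: fit, raw, bm25, or llm
        "q": "pricing plans",  # query used by the BM25/LLM filters
        "c": "0",              # cache-bust / revision counter
    },
)
print(resp.json())
```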

View File

@@ -66,29 +66,38 @@ Sometimes you need a visual record of a page or a PDF “printout.” Crawl4AI c
 ```python
 import os, asyncio
 from base64 import b64decode
-from crawl4ai import AsyncWebCrawler, CacheMode
+from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig

 async def main():
+    run_config = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS,
+        screenshot=True,
+        pdf=True
+    )
     async with AsyncWebCrawler() as crawler:
         result = await crawler.arun(
             url="https://en.wikipedia.org/wiki/List_of_common_misconceptions",
-            cache_mode=CacheMode.BYPASS,
-            pdf=True,
-            screenshot=True
+            config=run_config
         )

         if result.success:
-            # Save screenshot
+            print(f"Screenshot data present: {result.screenshot is not None}")
+            print(f"PDF data present: {result.pdf is not None}")

             if result.screenshot:
+                print(f"[OK] Screenshot captured, size: {len(result.screenshot)} bytes")
                 with open("wikipedia_screenshot.png", "wb") as f:
                     f.write(b64decode(result.screenshot))
+            else:
+                print("[WARN] Screenshot data is None.")

-            # Save PDF
             if result.pdf:
+                print(f"[OK] PDF captured, size: {len(result.pdf)} bytes")
                 with open("wikipedia_page.pdf", "wb") as f:
                     f.write(result.pdf)
+            else:
+                print("[WARN] PDF data is None.")

-            print("[OK] PDF & screenshot captured.")
         else:
             print("[ERROR]", result.error_message)

View File

@@ -25,44 +25,70 @@ Use an authenticated proxy with `BrowserConfig`:
 ```python
 from crawl4ai.async_configs import BrowserConfig

-proxy_config = {
-    "server": "http://proxy.example.com:8080",
-    "username": "user",
-    "password": "pass"
-}
-browser_config = BrowserConfig(proxy_config=proxy_config)
+browser_config = BrowserConfig(proxy="http://[username]:[password]@[host]:[port]")
 async with AsyncWebCrawler(config=browser_config) as crawler:
     result = await crawler.arun(url="https://example.com")
 ```

 ## Rotating Proxies

 Example using a proxy rotation service dynamically:

 ```python
-from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
-
-async def get_next_proxy():
-    # Your proxy rotation logic here
-    return {"server": "http://next.proxy.com:8080"}
+import re
+from crawl4ai import (
+    AsyncWebCrawler,
+    BrowserConfig,
+    CrawlerRunConfig,
+    CacheMode,
+    RoundRobinProxyStrategy,
+)
+import asyncio
+from crawl4ai import ProxyConfig

 async def main():
-    browser_config = BrowserConfig()
-    run_config = CrawlerRunConfig()
+    # Load proxies and create rotation strategy
+    proxies = ProxyConfig.from_env()
+    # e.g. export PROXIES="ip1:port1:username1:password1,ip2:port2:username2:password2"
+    if not proxies:
+        print("No proxies found in environment. Set PROXIES env variable!")
+        return
+
+    proxy_strategy = RoundRobinProxyStrategy(proxies)
+
+    # Create configs
+    browser_config = BrowserConfig(headless=True, verbose=False)
+    run_config = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS,
+        proxy_rotation_strategy=proxy_strategy
+    )

     async with AsyncWebCrawler(config=browser_config) as crawler:
-        # For each URL, create a new run config with different proxy
-        for url in urls:
-            proxy = await get_next_proxy()
-            # Clone the config and update proxy - this creates a new browser context
-            current_config = run_config.clone(proxy_config=proxy)
-            result = await crawler.arun(url=url, config=current_config)
+        urls = ["https://httpbin.org/ip"] * (len(proxies) * 2)  # Test each proxy twice
+
+        print("\n📈 Initializing crawler with proxy rotation...")
+        async with AsyncWebCrawler(config=browser_config) as crawler:
+            print("\n🚀 Starting batch crawl with proxy rotation...")
+            results = await crawler.arun_many(
+                urls=urls,
+                config=run_config
+            )
+            for result in results:
+                if result.success:
+                    ip_match = re.search(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', result.html)
+                    current_proxy = run_config.proxy_config if run_config.proxy_config else None
+                    if current_proxy and ip_match:
+                        print(f"URL {result.url}")
+                        print(f"Proxy {current_proxy.server} -> Response IP: {ip_match.group(0)}")
+                        verified = ip_match.group(0) == current_proxy.ip
+                        if verified:
+                            print(f"✅ Proxy working! IP matches: {current_proxy.ip}")
+                        else:
+                            print("❌ Proxy failed or IP mismatch!")
+                    print("---")

+if __name__ == "__main__":
+    import asyncio
     asyncio.run(main())
 ```
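
The `PROXIES` format consumed by `ProxyConfig.from_env()` is the one shown in the example's comment; wiring it up for a quick local test might look like this (a sketch; addresses are placeholders):

```python
import os

# Comma-separated entries in ip:port:username:password form
os.environ["PROXIES"] = ",".join([
    "203.0.113.10:8080:user1:pass1",
    "203.0.113.11:8080:user2:pass2",
])

from crawl4ai import ProxyConfig

proxies = ProxyConfig.from_env()
print(f"Loaded {len(proxies)} proxies")
```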

View File

@@ -273,7 +273,7 @@ In a typical scenario, you define **one** `BrowserConfig` for your crawler sessi
 ```python
 import asyncio
-from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig, LLMContentFilter, DefaultMarkdownGenerator
 from crawl4ai import JsonCssExtractionStrategy

 async def main():
@@ -298,7 +298,7 @@ async def main():
     # 3) Example LLM content filtering
     gemini_config = LLMConfig(
-        provider="gemini/gemini-1.5-pro"
+        provider="gemini/gemini-1.5-pro",
         api_token = "env:GEMINI_API_TOKEN"
     )
@@ -324,6 +324,7 @@ async def main():
     md_generator = DefaultMarkdownGenerator(
         content_filter=filter,
         options={"ignore_links": True}
+    )

     # 4) Crawler run config: skip cache, use extraction
     run_conf = CrawlerRunConfig(

View File

@@ -17,6 +17,9 @@
 - [Configuration Reference](#configuration-reference)
 - [Best Practices & Tips](#best-practices--tips)

+## Installation
+The Crawl4AI CLI is installed automatically when you install the library.

 ## Basic Usage

 The Crawl4AI CLI (`crwl`) provides a simple interface to the Crawl4AI library:

View File

@@ -8,11 +8,11 @@ To crawl a live web page, provide the URL starting with `http://` or `https://`,
 ```python
 import asyncio
-from crawl4ai import AsyncWebCrawler
+from crawl4ai import AsyncWebCrawler, CacheMode
 from crawl4ai.async_configs import CrawlerRunConfig

 async def crawl_web():
-    config = CrawlerRunConfig(bypass_cache=True)
+    config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
     async with AsyncWebCrawler() as crawler:
         result = await crawler.arun(
             url="https://en.wikipedia.org/wiki/apple",
@@ -33,13 +33,13 @@ To crawl a local HTML file, prefix the file path with `file://`.
 ```python
 import asyncio
-from crawl4ai import AsyncWebCrawler
+from crawl4ai import AsyncWebCrawler, CacheMode
 from crawl4ai.async_configs import CrawlerRunConfig

 async def crawl_local_file():
     local_file_path = "/path/to/apple.html"  # Replace with your file path
     file_url = f"file://{local_file_path}"
-    config = CrawlerRunConfig(bypass_cache=True)
+    config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)

     async with AsyncWebCrawler() as crawler:
         result = await crawler.arun(url=file_url, config=config)
@@ -93,7 +93,7 @@ import os
 import sys
 import asyncio
 from pathlib import Path
-from crawl4ai import AsyncWebCrawler
+from crawl4ai import AsyncWebCrawler, CacheMode
 from crawl4ai.async_configs import CrawlerRunConfig

 async def main():
@@ -104,7 +104,7 @@ async def main():
     async with AsyncWebCrawler() as crawler:
         # Step 1: Crawl the Web URL
         print("\n=== Step 1: Crawling the Wikipedia URL ===")
-        web_config = CrawlerRunConfig(bypass_cache=True)
+        web_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
         result = await crawler.arun(url=wikipedia_url, config=web_config)

         if not result.success:
@@ -119,7 +119,7 @@ async def main():
         # Step 2: Crawl from the Local HTML File
         print("=== Step 2: Crawling from the Local HTML File ===")
         file_url = f"file://{html_file_path.resolve()}"
-        file_config = CrawlerRunConfig(bypass_cache=True)
+        file_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
         local_result = await crawler.arun(url=file_url, config=file_config)

         if not local_result.success:
@@ -135,7 +135,7 @@ async def main():
         with open(html_file_path, 'r', encoding='utf-8') as f:
             raw_html_content = f.read()
         raw_html_url = f"raw:{raw_html_content}"
-        raw_config = CrawlerRunConfig(bypass_cache=True)
+        raw_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
         raw_result = await crawler.arun(url=raw_html_url, config=raw_config)

         if not raw_result.success:

View File

@@ -201,6 +201,7 @@ config = CrawlerRunConfig(markdown_generator=md_generator)
 - **`user_query`**: The term you want to focus on. BM25 tries to keep only content blocks relevant to that query.
 - **`bm25_threshold`**: Raise it to keep fewer blocks; lower it to keep more.
 - **`use_stemming`** *(default `True`)*: If enabled, variations of words match (e.g., “learn,” “learning,” “learnt”).
+- **`language`** *(default `'english'`)*: Language used for stemming.

 **No query provided?** BM25 tries to glean a context from page metadata, or you can simply treat it as a scorched-earth approach that discards text with low generic score. Realistically, you want to supply a query for best results.
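
Putting those parameters together (a sketch; the parameter names follow the list above, and the import path mirrors the LLMContentFilter example below):

```python
from crawl4ai.content_filter_strategy import BM25ContentFilter

bm25_filter = BM25ContentFilter(
    user_query="machine learning frameworks",  # the focus term
    bm25_threshold=1.2,   # raise to keep fewer blocks
    use_stemming=True,    # "learn" also matches "learning", "learnt"
    language="english",   # stemmer language (the new parameter above)
)
```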
@@ -233,7 +234,7 @@ prune_filter = PruningContentFilter(
 For intelligent content filtering and high-quality markdown generation, you can use the **LLMContentFilter**. This filter leverages LLMs to generate relevant markdown while preserving the original content's meaning and structure:

 ```python
-from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig, DefaultMarkdownGenerator
 from crawl4ai.content_filter_strategy import LLMContentFilter

 async def main():
@@ -255,9 +256,12 @@ async def main():
         chunk_token_threshold=4096,  # Adjust based on your needs
         verbose=True
     )

+    md_generator = DefaultMarkdownGenerator(
+        content_filter=filter,
+        options={"ignore_links": True}
+    )
     config = CrawlerRunConfig(
-        content_filter=filter
+        markdown_generator=md_generator,
     )

     async with AsyncWebCrawler() as crawler:

View File

@@ -17,7 +17,7 @@ dependencies = [
"lxml~=5.3", "lxml~=5.3",
"litellm>=1.53.1", "litellm>=1.53.1",
"numpy>=1.26.0,<3", "numpy>=1.26.0,<3",
"pillow~=10.4", "pillow>=10.4",
"playwright>=1.49.0", "playwright>=1.49.0",
"python-dotenv~=1.0", "python-dotenv~=1.0",
"requests~=2.26", "requests~=2.26",
@@ -32,7 +32,6 @@ dependencies = [
"psutil>=6.1.1", "psutil>=6.1.1",
"nltk>=3.9.1", "nltk>=3.9.1",
"playwright", "playwright",
"aiofiles",
"rich>=13.9.4", "rich>=13.9.4",
"cssselect>=1.2.0", "cssselect>=1.2.0",
"httpx>=0.27.2", "httpx>=0.27.2",

View File

@@ -4,7 +4,7 @@ aiosqlite~=0.20
 lxml~=5.3
 litellm>=1.53.1
 numpy>=1.26.0,<3
-pillow~=10.4
+pillow>=10.4
 playwright>=1.49.0
 python-dotenv~=1.0
 requests~=2.26
@@ -27,3 +27,7 @@ httpx[http2]>=0.27.2
 sentence-transformers>=2.2.0
 alphashape>=1.3.1
 shapely>=2.0.0
+fake-useragent>=2.2.0
+pdf2image>=1.17.0
+PyPDF2>=3.0.1

View File

@@ -105,7 +105,7 @@ def test_docker_deployment(version="basic"):
 def test_basic_crawl(tester: Crawl4AiTester):
     print("\n=== Testing Basic Crawl ===")
     request = {
-        "urls": "https://www.nbcnews.com/business",
+        "urls": ["https://www.nbcnews.com/business"],
         "priority": 10,
         "session_id": "test",
     }
@@ -119,7 +119,7 @@ def test_basic_crawl(tester: Crawl4AiTester):
 def test_basic_crawl_sync(tester: Crawl4AiTester):
     print("\n=== Testing Basic Crawl (Sync) ===")
     request = {
-        "urls": "https://www.nbcnews.com/business",
+        "urls": ["https://www.nbcnews.com/business"],
         "priority": 10,
         "session_id": "test",
     }
@@ -134,7 +134,7 @@ def test_basic_crawl_sync(tester: Crawl4AiTester):
 def test_js_execution(tester: Crawl4AiTester):
     print("\n=== Testing JS Execution ===")
     request = {
-        "urls": "https://www.nbcnews.com/business",
+        "urls": ["https://www.nbcnews.com/business"],
         "priority": 8,
         "js_code": [
             "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
@@ -151,7 +151,7 @@ def test_js_execution(tester: Crawl4AiTester):
 def test_css_selector(tester: Crawl4AiTester):
     print("\n=== Testing CSS Selector ===")
     request = {
-        "urls": "https://www.nbcnews.com/business",
+        "urls": ["https://www.nbcnews.com/business"],
         "priority": 7,
         "css_selector": ".wide-tease-item__description",
         "crawler_params": {"headless": True},
@@ -188,7 +188,7 @@ def test_structured_extraction(tester: Crawl4AiTester):
     }

     request = {
-        "urls": "https://www.coinbase.com/explore",
+        "urls": ["https://www.coinbase.com/explore"],
         "priority": 9,
         "extraction_config": {"type": "json_css", "params": {"schema": schema}},
     }
@@ -223,7 +223,7 @@ def test_llm_extraction(tester: Crawl4AiTester):
     }

     request = {
-        "urls": "https://openai.com/api/pricing",
+        "urls": ["https://openai.com/api/pricing"],
         "priority": 8,
         "extraction_config": {
             "type": "llm",
@@ -270,7 +270,7 @@ def test_llm_with_ollama(tester: Crawl4AiTester):
     }

     request = {
-        "urls": "https://www.nbcnews.com/business",
+        "urls": ["https://www.nbcnews.com/business"],
         "priority": 8,
         "extraction_config": {
             "type": "llm",
def test_cosine_extraction(tester: Crawl4AiTester): def test_cosine_extraction(tester: Crawl4AiTester):
print("\n=== Testing Cosine Extraction ===") print("\n=== Testing Cosine Extraction ===")
request = { request = {
"urls": "https://www.nbcnews.com/business", "urls": ["https://www.nbcnews.com/business"],
"priority": 8, "priority": 8,
"extraction_config": { "extraction_config": {
"type": "cosine", "type": "cosine",
@@ -323,7 +323,7 @@ def test_cosine_extraction(tester: Crawl4AiTester):
 def test_screenshot(tester: Crawl4AiTester):
     print("\n=== Testing Screenshot ===")
     request = {
-        "urls": "https://www.nbcnews.com/business",
+        "urls": ["https://www.nbcnews.com/business"],
         "priority": 5,
         "screenshot": True,
         "crawler_params": {"headless": True},

View File

@@ -74,7 +74,7 @@ def test_docker_deployment(version="basic"):
 def test_basic_crawl(tester: Crawl4AiTester):
     print("\n=== Testing Basic Crawl ===")
-    request = {"urls": "https://www.nbcnews.com/business", "priority": 10}
+    request = {"urls": ["https://www.nbcnews.com/business"], "priority": 10}
     result = tester.submit_and_wait(request)
     print(f"Basic crawl result length: {len(result['result']['markdown'])}")
@@ -85,7 +85,7 @@ def test_basic_crawl(tester: Crawl4AiTester):
 def test_js_execution(tester: Crawl4AiTester):
     print("\n=== Testing JS Execution ===")
     request = {
-        "urls": "https://www.nbcnews.com/business",
+        "urls": ["https://www.nbcnews.com/business"],
         "priority": 8,
         "js_code": [
             "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
@@ -102,7 +102,7 @@ def test_js_execution(tester: Crawl4AiTester):
 def test_css_selector(tester: Crawl4AiTester):
     print("\n=== Testing CSS Selector ===")
     request = {
-        "urls": "https://www.nbcnews.com/business",
+        "urls": ["https://www.nbcnews.com/business"],
         "priority": 7,
         "css_selector": ".wide-tease-item__description",
         "crawler_params": {"headless": True},
@@ -139,7 +139,7 @@ def test_structured_extraction(tester: Crawl4AiTester):
     }

     request = {
-        "urls": "https://www.coinbase.com/explore",
+        "urls": ["https://www.coinbase.com/explore"],
         "priority": 9,
         "extraction_config": {"type": "json_css", "params": {"schema": schema}},
     }
@@ -174,7 +174,7 @@ def test_llm_extraction(tester: Crawl4AiTester):
     }

     request = {
-        "urls": "https://openai.com/api/pricing",
+        "urls": ["https://openai.com/api/pricing"],
         "priority": 8,
         "extraction_config": {
             "type": "llm",
@@ -221,7 +221,7 @@ def test_llm_with_ollama(tester: Crawl4AiTester):
     }

     request = {
-        "urls": "https://www.nbcnews.com/business",
+        "urls": ["https://www.nbcnews.com/business"],
         "priority": 8,
         "extraction_config": {
             "type": "llm",
@@ -248,7 +248,7 @@ def test_llm_with_ollama(tester: Crawl4AiTester):
 def test_cosine_extraction(tester: Crawl4AiTester):
     print("\n=== Testing Cosine Extraction ===")
     request = {
-        "urls": "https://www.nbcnews.com/business",
+        "urls": ["https://www.nbcnews.com/business"],
         "priority": 8,
         "extraction_config": {
             "type": "cosine",
@@ -274,7 +274,7 @@ def test_cosine_extraction(tester: Crawl4AiTester):
 def test_screenshot(tester: Crawl4AiTester):
     print("\n=== Testing Screenshot ===")
     request = {
-        "urls": "https://www.nbcnews.com/business",
+        "urls": ["https://www.nbcnews.com/business"],
         "priority": 5,
         "screenshot": True,
         "crawler_params": {"headless": True},

View File

@@ -54,7 +54,7 @@ class NBCNewsAPITest:
 async def test_basic_crawl():
     print("\n=== Testing Basic Crawl ===")
     async with NBCNewsAPITest() as api:
-        request = {"urls": "https://www.nbcnews.com/business", "priority": 10}
+        request = {"urls": ["https://www.nbcnews.com/business"], "priority": 10}
         task_id = await api.submit_crawl(request)
         result = await api.wait_for_task(task_id)
         print(f"Basic crawl result length: {len(result['result']['markdown'])}")
@@ -67,7 +67,7 @@ async def test_js_execution():
print("\n=== Testing JS Execution ===") print("\n=== Testing JS Execution ===")
async with NBCNewsAPITest() as api: async with NBCNewsAPITest() as api:
request = { request = {
"urls": "https://www.nbcnews.com/business", "urls": ["https://www.nbcnews.com/business"],
"priority": 8, "priority": 8,
"js_code": [ "js_code": [
"const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();" "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
@@ -86,7 +86,7 @@ async def test_css_selector():
print("\n=== Testing CSS Selector ===") print("\n=== Testing CSS Selector ===")
async with NBCNewsAPITest() as api: async with NBCNewsAPITest() as api:
request = { request = {
"urls": "https://www.nbcnews.com/business", "urls": ["https://www.nbcnews.com/business"],
"priority": 7, "priority": 7,
"css_selector": ".wide-tease-item__description", "css_selector": ".wide-tease-item__description",
} }
@@ -120,7 +120,7 @@ async def test_structured_extraction():
     }

     request = {
-        "urls": "https://www.nbcnews.com/business",
+        "urls": ["https://www.nbcnews.com/business"],
         "priority": 9,
         "extraction_config": {"type": "json_css", "params": {"schema": schema}},
     }
@@ -177,7 +177,7 @@ async def test_llm_extraction():
     }

     request = {
-        "urls": "https://www.nbcnews.com/business",
+        "urls": ["https://www.nbcnews.com/business"],
         "priority": 8,
         "extraction_config": {
             "type": "llm",
@@ -209,7 +209,7 @@ async def test_screenshot():
print("\n=== Testing Screenshot ===") print("\n=== Testing Screenshot ===")
async with NBCNewsAPITest() as api: async with NBCNewsAPITest() as api:
request = { request = {
"urls": "https://www.nbcnews.com/business", "urls": ["https://www.nbcnews.com/business"],
"priority": 5, "priority": 5,
"screenshot": True, "screenshot": True,
"crawler_params": {"headless": True}, "crawler_params": {"headless": True},
@@ -227,7 +227,7 @@ async def test_priority_handling():
     async with NBCNewsAPITest() as api:
         # Submit low priority task first
         low_priority = {
-            "urls": "https://www.nbcnews.com/business",
+            "urls": ["https://www.nbcnews.com/business"],
             "priority": 1,
             "crawler_params": {"headless": True},
         }
@@ -235,7 +235,7 @@ async def test_priority_handling():
         # Submit high priority task
         high_priority = {
-            "urls": "https://www.nbcnews.com/business/consumer",
+            "urls": ["https://www.nbcnews.com/business/consumer"],
             "priority": 10,
             "crawler_params": {"headless": True},
         }

View File

@@ -0,0 +1,91 @@
+import unittest
+from crawl4ai.utils import normalize_url
+
+
+class TestNormalizeUrl(unittest.TestCase):
+    def test_basic_relative_path(self):
+        self.assertEqual(normalize_url("path/to/page.html", "http://example.com/base/"), "http://example.com/base/path/to/page.html")
+
+    def test_base_url_with_trailing_slash(self):
+        self.assertEqual(normalize_url("page.html", "http://example.com/base/"), "http://example.com/base/page.html")
+
+    def test_base_url_without_trailing_slash(self):
+        # If normalize_url correctly uses urljoin, "base" is treated as a file.
+        self.assertEqual(normalize_url("page.html", "http://example.com/base"), "http://example.com/page.html")
+
+    def test_absolute_url_as_href(self):
+        self.assertEqual(normalize_url("http://another.com/page.html", "http://example.com/"), "http://another.com/page.html")
+
+    def test_href_with_leading_trailing_spaces(self):
+        self.assertEqual(normalize_url(" page.html ", "http://example.com/"), "http://example.com/page.html")
+
+    def test_empty_href(self):
+        # urljoin with an empty href and base ending in '/' returns the base.
+        self.assertEqual(normalize_url("", "http://example.com/base/"), "http://example.com/base/")
+        # urljoin with an empty href and base not ending in '/' also returns the base.
+        self.assertEqual(normalize_url("", "http://example.com/base"), "http://example.com/base")
+
+    def test_href_with_query_parameters(self):
+        self.assertEqual(normalize_url("page.html?query=test", "http://example.com/"), "http://example.com/page.html?query=test")
+
+    def test_href_with_fragment(self):
+        self.assertEqual(normalize_url("page.html#section", "http://example.com/"), "http://example.com/page.html#section")
+
+    def test_different_scheme_in_href(self):
+        self.assertEqual(normalize_url("https://secure.example.com/page.html", "http://example.com/"), "https://secure.example.com/page.html")
+
+    def test_parent_directory_in_href(self):
+        self.assertEqual(normalize_url("../otherpage.html", "http://example.com/base/current/"), "http://example.com/base/otherpage.html")
+
+    def test_root_relative_href(self):
+        self.assertEqual(normalize_url("/otherpage.html", "http://example.com/base/current/"), "http://example.com/otherpage.html")
+
+    def test_base_url_with_path_and_no_trailing_slash(self):
+        # If normalize_url correctly uses urljoin, "path" is treated as a file.
+        self.assertEqual(normalize_url("file.html", "http://example.com/path"), "http://example.com/file.html")
+
+    def test_base_url_is_just_domain(self):
+        self.assertEqual(normalize_url("page.html", "http://example.com"), "http://example.com/page.html")
+
+    def test_href_is_only_query(self):
+        self.assertEqual(normalize_url("?query=true", "http://example.com/page.html"), "http://example.com/page.html?query=true")
+
+    def test_href_is_only_fragment(self):
+        self.assertEqual(normalize_url("#fragment", "http://example.com/page.html"), "http://example.com/page.html#fragment")
+
+    def test_relative_link_from_base_file_url(self):
+        """
+        Tests the specific bug report: relative links from a base URL that is a file.
+        Example:
+            Page URL: http://example.com/path/to/document.html
+            Link on page: <a href="./file.xlsx">
+            Expected: http://example.com/path/to/file.xlsx
+        """
+        base_url_file = "http://example.com/zwgk/fdzdgk/zdxx/spaq/t19360680.shtml"
+        href_relative_current_dir = "./P020241203375994691134.xlsx"
+        expected_url1 = "http://example.com/zwgk/fdzdgk/zdxx/spaq/P020241203375994691134.xlsx"
+        self.assertEqual(normalize_url(href_relative_current_dir, base_url_file), expected_url1)
+
+        # Test with a relative link that doesn't start with "./"
+        href_relative_no_dot_slash = "another.doc"
+        expected_url2 = "http://example.com/zwgk/fdzdgk/zdxx/spaq/another.doc"
+        self.assertEqual(normalize_url(href_relative_no_dot_slash, base_url_file), expected_url2)
+
+    def test_invalid_base_url_scheme(self):
+        with self.assertRaises(ValueError) as context:
+            normalize_url("page.html", "ftp://example.com/")
+        self.assertIn("Invalid base URL format", str(context.exception))
+
+    def test_invalid_base_url_netloc(self):
+        with self.assertRaises(ValueError) as context:
+            normalize_url("page.html", "http:///path/")
+        self.assertIn("Invalid base URL format", str(context.exception))
+
+    def test_base_url_with_port(self):
+        self.assertEqual(normalize_url("path/file.html", "http://example.com:8080/base/"), "http://example.com:8080/base/path/file.html")
+
+    def test_href_with_special_characters(self):
+        self.assertEqual(normalize_url("path%20with%20spaces/file.html", "http://example.com/"), "http://example.com/path%20with%20spaces/file.html")
+
+
+if __name__ == '__main__':
+    unittest.main()