Merge pull request #2 from aravindkarnam/staging

Staging
2024-09-18 18:16:23 +05:30
parent 44ce12c62c d628bc4034
commit 78f26ac263
7 changed files with 54 additions and 12 deletions
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -12,6 +12,7 @@ import json, uuid
 import hashlib
 from pathlib import Path
 from playwright.async_api import ProxySettings
+from pydantic import BaseModel

 def calculate_semaphore_count():
    cpu_count = os.cpu_count()
@@ -20,13 +21,18 @@ def calculate_semaphore_count():
    memory_based_cap = int(memory_gb / 2)  # Assume 2GB per instance
    return min(base_count, memory_based_cap)

+class AsyncCrawlResponse(BaseModel):
+    html: str
+    response_headers: Dict[str, str]
+    status_code: int
+
 class AsyncCrawlerStrategy(ABC):
    @abstractmethod
-    async def crawl(self, url: str, **kwargs) -> str:
+    async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
        pass
    
    @abstractmethod
-    async def crawl_many(self, urls: List[str], **kwargs) -> List[str]:
+    async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]:
        pass
    
    @abstractmethod
@@ -140,7 +146,10 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
        for sid in expired_sessions:
            asyncio.create_task(self.kill_session(sid))

-    async def crawl(self, url: str, **kwargs) -> str:
+    async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
+        response_headers = {}
+        status_code = None
+        
        self._cleanup_expired_sessions()
        session_id = kwargs.get("session_id")
        if session_id:
@@ -168,13 +177,25 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
            if self.use_cached_html:
                cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest())
                if os.path.exists(cache_file_path):
+                    html = ""
                    with open(cache_file_path, "r") as f:
-                        return f.read()
+                        html = f.read()
+                    # retrieve response headers and status code from cache
+                    with open(cache_file_path + ".meta", "r") as f:
+                        meta = json.load(f)
+                        response_headers = meta.get("response_headers", {})
+                        status_code = meta.get("status_code")
+                    response = AsyncCrawlResponse(html=html, response_headers=response_headers, status_code=status_code)
+                    return response

            if not kwargs.get("js_only", False):
                await self.execute_hook('before_goto', page)
-                await page.goto(url, wait_until="domcontentloaded", timeout=60000)
+                response = await page.goto(url, wait_until="domcontentloaded", timeout=60000)
                await self.execute_hook('after_goto', page)
+                
+                # Get status code and headers
+                status_code = response.status
+                response_headers = response.headers

            await page.wait_for_selector('body')
            await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
@@ -202,8 +223,15 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest())
                with open(cache_file_path, "w", encoding="utf-8") as f:
                    f.write(html)
+                # store response headers and status code in cache
+                with open(cache_file_path + ".meta", "w", encoding="utf-8") as f:
+                    json.dump({
+                        "response_headers": response_headers,
+                        "status_code": status_code
+                    }, f)

-            return html
+            response = AsyncCrawlResponse(html=html, response_headers=response_headers, status_code=status_code)
+            return response
        except Error as e:
            raise Error(f"Failed to crawl {url}: {str(e)}")
        finally:
@@ -218,7 +246,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
        # except Exception as e:
        #     raise Exception(f"Failed to crawl {url}: {str(e)}")

-    async def crawl_many(self, urls: List[str], **kwargs) -> List[str]:
+    async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]:
        semaphore_count = kwargs.get('semaphore_count', calculate_semaphore_count())
        semaphore = asyncio.Semaphore(semaphore_count)

--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -8,7 +8,7 @@ from .models import CrawlResult
 from .async_database import async_db_manager
 from .chunking_strategy import *
 from .extraction_strategy import *
-from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy
+from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse
 from .content_scrapping_strategy import WebScrappingStrategy
 from .config import MIN_WORD_THRESHOLD, IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD
 from .utils import (
@@ -101,7 +101,8 @@ class AsyncWebCrawler:
                t1 = time.time()
                if user_agent:
                    self.crawler_strategy.update_user_agent(user_agent)
-                html = await self.crawler_strategy.crawl(url, **kwargs)
+                async_response : AsyncCrawlResponse = await self.crawler_strategy.crawl(url, **kwargs)
+                html = sanitize_input_encode(async_response.html)
                t2 = time.time()
                if verbose:
                    print(
@@ -121,8 +122,11 @@ class AsyncWebCrawler:
                screenshot_data,
                verbose,
                bool(cached),
+                async_response=async_response,
                **kwargs,
            )
+            crawl_result.status_code = async_response.status_code
+            crawl_result.responser_headers = async_response.response_headers
            crawl_result.success = bool(html)
            crawl_result.session_id = kwargs.get("session_id", None)
            return crawl_result
--- a/crawl4ai/content_scrapping_strategy.py
+++ b/crawl4ai/content_scrapping_strategy.py
@@ -43,6 +43,10 @@ class WebScrappingStrategy(ContentScrappingStrategy):
        
        image_description_min_word_threshold = kwargs.get('image_description_min_word_threshold', IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD)

+        for tag in kwargs.get('excluded_tags', []) or []:
+            for el in body.select(tag):
+                el.decompose()
+        
        if css_selector:
            selected_elements = body.select(css_selector)
            if not selected_elements:
--- a/crawl4ai/models.py
+++ b/crawl4ai/models.py
@@ -17,4 +17,6 @@ class CrawlResult(BaseModel):
    extracted_content: Optional[str] = None
    metadata: Optional[dict] = None
    error_message: Optional[str] = None
-    session_id: Optional[str] = None
+    session_id: Optional[str] = None
+    responser_headers: Optional[dict] = None
+    status_code: Optional[int] = None
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -441,6 +441,10 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
    
    image_description_min_word_threshold = kwargs.get('image_description_min_word_threshold', IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD)

+    for tag in kwargs.get('excluded_tags', []) or []:
+        for el in body.select(tag):
+            el.decompose()
+        
    if css_selector:
        selected_elements = body.select(css_selector)
        if not selected_elements:
--- a/pages/index_pooling.html
+++ b/pages/index_pooling.html
@@ -50,7 +50,7 @@
    <body>
        <header class="bg-gray-900 text-white py-4">
            <div class="container mx-auto px-4">
-                <h1 class="text-2xl font-bold">🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Scrapper</h1>
+                <h1 class="text-2xl font-bold">🔥🕷️ Crawl4AI: Open-source LLM Friendly Web scraper</h1>
            </div>
        </header>

--- a/setup.py
+++ b/setup.py
@@ -26,7 +26,7 @@ transformer_requirements = [req for req in requirements if req.startswith(("tran
 setup(
    name="Crawl4AI",
    version="0.2.77",
-    description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper",
+    description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper",
    long_description=open("README.md", encoding="utf-8").read(),
    long_description_content_type="text/markdown",
    url="https://github.com/unclecode/crawl4ai",