chore(git): update gitignore patterns

Add new development and tooling related patterns to gitignore: - Add Next.js build directory (.next/) - Add various script and documentation files - Add local development directories (.local, .scripts, .do) - Add tool-specific files (.codeiumignore, .windsurfrules) Removes duplicate entries and organizes patterns more clearly.
Update .gitignore to include new directories for issues and documentation
2025-01-22 17:22:26 +08:00 · 2024-11-06 18:44:03 +08:00 · 2024-11-06 07:00:44 +01:00 · 2024-10-17 15:42:43 +05:30 · 2024-10-17 12:25:17 +05:30 · 2024-10-16 22:36:48 +05:30
13 changed files with 289 additions and 1 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -199,8 +199,35 @@ test_env/
 **/.DS_Store

 todo.md
+todo_executor.md
 git_changes.py
 git_changes.md
 pypi_build.sh
+git_issues.py
+git_issues.md

-.tests/
+.next/
+.tests/
+# .issues/
+.docs/
+.issues/
+.gitboss/
+todo_executor.md
+protect-all-except-feature.sh
+manage-collab.sh
+publish.sh
+combine.sh
+combined_output.txt
+.local
+.scripts
+tree.md
+tree.md
+.scripts
+.local
+.do
+/plans
+.codeiumignore
+todo/
+
+# windsurf rules
+.windsurfrules
--- a/crawl4ai/scraper/init.py
+++ b/crawl4ai/scraper/init.py
@@ -0,0 +1,2 @@
+from .async_web_scraper import AsyncWebScraper
+from .bfs_scraper_strategy import BFSScraperStrategy
--- a/crawl4ai/scraper/async_web_scraper.py
+++ b/crawl4ai/scraper/async_web_scraper.py
@@ -0,0 +1,33 @@
+from .scraper_strategy import ScraperStrategy
+from .models import ScraperResult, CrawlResult
+from ..async_webcrawler import AsyncWebCrawler
+from typing import Union, AsyncGenerator
+
+class AsyncWebScraper:
+    def __init__(self, crawler: AsyncWebCrawler, strategy: ScraperStrategy):
+        self.crawler = crawler
+        self.strategy = strategy
+
+    async def ascrape(self, url: str, parallel_processing: bool = True, stream: bool = False) -> Union[AsyncGenerator[CrawlResult, None], ScraperResult]:
+        if stream:
+            return self._ascrape_yielding(url, parallel_processing)
+        else:
+            return await self._ascrape_collecting(url, parallel_processing)
+
+    async def _ascrape_yielding(self, url: str, parallel_processing: bool) -> AsyncGenerator[CrawlResult, None]:
+        result_generator = self.strategy.ascrape(url, self.crawler, parallel_processing)
+        async for res in result_generator:  # Consume the async generator
+            yield res  # Yielding individual results
+
+    async def _ascrape_collecting(self, url: str, parallel_processing: bool) -> ScraperResult:
+        extracted_data = {}
+        result_generator = self.strategy.ascrape(url, self.crawler, parallel_processing)
+        async for res in result_generator:  # Consume the async generator
+            extracted_data[res.url] = res
+
+        # Return a final ScraperResult
+        return ScraperResult(
+            url=url,
+            crawled_urls=list(extracted_data.keys()),
+            extracted_data=extracted_data
+        )
--- a/crawl4ai/scraper/bfs_scraper_strategy.py
+++ b/crawl4ai/scraper/bfs_scraper_strategy.py
@@ -0,0 +1,139 @@
+from .scraper_strategy import ScraperStrategy
+from .filters import FilterChain
+from .scorers import URLScorer
+from ..models import CrawlResult
+from ..async_webcrawler import AsyncWebCrawler
+import asyncio
+import validators
+from urllib.parse import urljoin,urlparse,urlunparse
+from urllib.robotparser import RobotFileParser
+import time
+from aiolimiter import AsyncLimiter
+from tenacity import retry, stop_after_attempt, wait_exponential
+from collections import defaultdict
+import logging
+from typing import Dict, AsyncGenerator
+logging.basicConfig(level=logging.DEBUG)
+
+rate_limiter = AsyncLimiter(1, 1)  # 1 request per second
+
+class BFSScraperStrategy(ScraperStrategy):
+    def __init__(self, max_depth: int, filter_chain: FilterChain, url_scorer: URLScorer, max_concurrent: int = 5, min_crawl_delay: int=1):
+        self.max_depth = max_depth
+        self.filter_chain = filter_chain
+        self.url_scorer = url_scorer
+        self.max_concurrent = max_concurrent
+        # For Crawl Politeness
+        self.last_crawl_time = defaultdict(float)
+        self.min_crawl_delay = min_crawl_delay  # 1 second delay between requests to the same domain
+        # For Robots.txt Compliance
+        self.robot_parsers = {}
+
+    # Robots.txt Parser
+    def get_robot_parser(self, url: str) -> RobotFileParser:
+        domain = urlparse(url)
+        scheme = domain.scheme if domain.scheme else 'http'  # Default to 'http' if no scheme provided
+        netloc = domain.netloc
+        if netloc not in self.robot_parsers:
+            rp = RobotFileParser()
+            rp.set_url(f"{scheme}://{netloc}/robots.txt")
+            try:
+                rp.read()
+            except Exception as e:
+                # Log the type of error, message, and the URL
+                logging.warning(f"Error {type(e).__name__} occurred while fetching robots.txt for {netloc}: {e}")
+                return None
+            self.robot_parsers[netloc] = rp
+        return self.robot_parsers[netloc]
+
+    
+    # Retry with exponential backoff
+    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
+    async def retry_crawl(self, crawler: AsyncWebCrawler, url: str) -> CrawlResult:
+        return await crawler.arun(url)
+    
+    async def process_url(self, url: str, depth: int, crawler: AsyncWebCrawler, queue: asyncio.PriorityQueue, visited: set, depths: Dict[str, int]) -> AsyncGenerator[CrawlResult, None]:
+        def normalize_url(url: str) -> str:
+            parsed = urlparse(url)
+            return urlunparse(parsed._replace(fragment=""))
+        
+        # URL Validation
+        if not validators.url(url):
+            logging.warning(f"Invalid URL: {url}")
+            return None
+        
+        # Robots.txt Compliance
+        robot_parser = self.get_robot_parser(url)
+        if robot_parser is None:
+            logging.info(f"Could not retrieve robots.txt for {url}, hence proceeding with crawl.")
+        else:
+            # If robots.txt was fetched, check if crawling is allowed
+            if not robot_parser.can_fetch(crawler.crawler_strategy.user_agent, url):
+                logging.info(f"Skipping {url} as per robots.txt")
+                return None
+    
+        # Crawl Politeness
+        domain = urlparse(url).netloc
+        time_since_last_crawl = time.time() - self.last_crawl_time[domain]
+        if time_since_last_crawl < self.min_crawl_delay:
+            await asyncio.sleep(self.min_crawl_delay - time_since_last_crawl)
+        self.last_crawl_time[domain] = time.time()
+
+        # Rate Limiting
+        async with rate_limiter:
+            # Error Handling
+            try:
+                crawl_result = await self.retry_crawl(crawler, url)
+            except Exception as e:
+                logging.error(f"Error crawling {url}: {str(e)}")
+                crawl_result = CrawlResult(url=url, html="", success=False, status_code=0, error_message=str(e))
+        
+        if not crawl_result.success:
+            # Logging and Monitoring
+            logging.error(f"Failed to crawl URL: {url}. Error: {crawl_result.error_message}")
+            return crawl_result
+
+        # Process links
+        for link_type in ["internal", "external"]:
+            for link in crawl_result.links[link_type]:
+                absolute_link = urljoin(url, link['href'])
+                normalized_link = normalize_url(absolute_link)
+                if self.filter_chain.apply(normalized_link) and normalized_link not in visited:
+                    new_depth = depths[url] + 1
+                    if new_depth <= self.max_depth:
+                        # URL Scoring
+                        score = self.url_scorer.score(normalized_link)
+                        await queue.put((score, new_depth, normalized_link))
+                        depths[normalized_link] = new_depth
+        return crawl_result
+
+    async def ascrape(self, start_url: str, crawler: AsyncWebCrawler, parallel_processing:bool = True) -> AsyncGenerator[CrawlResult,None]:
+        queue = asyncio.PriorityQueue()
+        queue.put_nowait((0, 0, start_url))
+        visited = set()
+        depths = {start_url: 0}
+        pending_tasks = set()
+
+        while not queue.empty() or pending_tasks:
+            while not queue.empty() and len(pending_tasks) < self.max_concurrent:
+                _, depth, url = await queue.get()
+                if url not in visited:
+                    # Adding URL to the visited set here itself, (instead of after result generation)
+                    # so that other tasks are not queued for same URL, found at different depth before
+                    # crawling and extraction of this task is completed.
+                    visited.add(url)
+                    if parallel_processing:
+                        task = asyncio.create_task(self.process_url(url, depth, crawler, queue, visited, depths))
+                        pending_tasks.add(task)
+                    else:
+                        result = await self.process_url(url, depth, crawler, queue, visited, depths)
+                        if result:
+                            yield result 
+
+            # Wait for the first task to complete and yield results incrementally as each task is completed
+            if pending_tasks:
+                done, pending_tasks = await asyncio.wait(pending_tasks, return_when=asyncio.FIRST_COMPLETED)
+                for task in done:
+                    result = await task
+                    if result:
+                        yield result
--- a/crawl4ai/scraper/filters/init.py
+++ b/crawl4ai/scraper/filters/init.py
@@ -0,0 +1,3 @@
+from .url_filter import URLFilter, FilterChain
+from .content_type_filter import ContentTypeFilter
+from .url_pattern_filter import URLPatternFilter
--- a/crawl4ai/scraper/filters/content_type_filter.py
+++ b/crawl4ai/scraper/filters/content_type_filter.py
@@ -0,0 +1,8 @@
+from .url_filter import URLFilter
+
+class ContentTypeFilter(URLFilter):
+    def __init__(self, contentType: str):
+        self.contentType = contentType
+    def apply(self, url: str) -> bool:
+        #TODO: This is a stub. Will implement this later
+        return True
--- a/crawl4ai/scraper/filters/url_filter.py
+++ b/crawl4ai/scraper/filters/url_filter.py
@@ -0,0 +1,16 @@
+from abc import ABC, abstractmethod
+
+class URLFilter(ABC):
+    @abstractmethod
+    def apply(self, url: str) -> bool:
+        pass
+
+class FilterChain:
+    def __init__(self):
+        self.filters = []
+
+    def add_filter(self, filter: URLFilter):
+        self.filters.append(filter)
+
+    def apply(self, url: str) -> bool:
+        return all(filter.apply(url) for filter in self.filters)
--- a/crawl4ai/scraper/filters/url_pattern_filter.py
+++ b/crawl4ai/scraper/filters/url_pattern_filter.py
@@ -0,0 +1,9 @@
+from .url_filter import URLFilter
+from re import Pattern
+
+class URLPatternFilter(URLFilter):
+    def __init__(self, pattern: Pattern):
+        self.pattern = pattern
+    def apply(self, url: str) -> bool:
+        #TODO: This is a stub. Will implement this later.
+        return True
--- a/crawl4ai/scraper/models.py
+++ b/crawl4ai/scraper/models.py
@@ -0,0 +1,8 @@
+from pydantic import BaseModel
+from typing import List, Dict
+from ..models import CrawlResult
+
+class ScraperResult(BaseModel):
+    url: str
+    crawled_urls: List[str]
+    extracted_data: Dict[str,CrawlResult]
--- a/crawl4ai/scraper/scorers/init.py
+++ b/crawl4ai/scraper/scorers/init.py
@@ -0,0 +1,2 @@
+from .url_scorer import URLScorer
+from .keyword_relevance_scorer import KeywordRelevanceScorer
--- a/crawl4ai/scraper/scorers/keyword_relevance_scorer.py
+++ b/crawl4ai/scraper/scorers/keyword_relevance_scorer.py
@@ -0,0 +1,9 @@
+from .url_scorer import URLScorer
+from typing import List
+
+class KeywordRelevanceScorer(URLScorer):
+    def __init__(self,keywords: List[str]):
+        self.keyworkds = keywords
+    def score(self, url: str) -> float:
+        #TODO: This is a stub. Will implement this later.
+        return 1
--- a/crawl4ai/scraper/scorers/url_scorer.py
+++ b/crawl4ai/scraper/scorers/url_scorer.py
@@ -0,0 +1,6 @@
+from abc import ABC, abstractmethod
+
+class URLScorer(ABC):
+    @abstractmethod
+    def score(self, url: str) -> float:
+        pass
--- a/crawl4ai/scraper/scraper_strategy.py
+++ b/crawl4ai/scraper/scraper_strategy.py
@@ -0,0 +1,26 @@
+from abc import ABC, abstractmethod
+from .models import ScraperResult, CrawlResult
+from ..models import CrawlResult
+from ..async_webcrawler import AsyncWebCrawler
+from typing import Union, AsyncGenerator
+
+class ScraperStrategy(ABC):
+    @abstractmethod
+    async def ascrape(self, url: str, crawler: AsyncWebCrawler, parallel_processing: bool = True, stream: bool = False) -> Union[AsyncGenerator[CrawlResult, None], ScraperResult]:
+        """Scrape the given URL using the specified crawler.
+
+        Args:
+            url (str): The starting URL for the scrape.
+            crawler (AsyncWebCrawler): The web crawler instance.
+            parallel_processing (bool): Whether to use parallel processing. Defaults to True.
+            stream (bool): If True, yields individual crawl results as they are ready; 
+                                if False, accumulates results and returns a final ScraperResult.
+
+        Yields:
+            CrawlResult: Individual crawl results if stream is True.
+
+        Returns:
+            ScraperResult: A summary of the scrape results containing the final extracted data 
+            and the list of crawled URLs if stream is False.
+        """
+        pass
Author	SHA1	Message	Date
UncleCode	d21ffad3a2	chore(git): update gitignore patterns Add new development and tooling related patterns to gitignore: - Add Next.js build directory (.next/) - Add various script and documentation files - Add local development directories (.local, .scripts, .do) - Add tool-specific files (.codeiumignore, .windsurfrules) Removes duplicate entries and organizes patterns more clearly.	2025-01-22 17:22:26 +08:00
UncleCode	06b21dcc50	Update .gitignore to include new directories for issues and documentation	2024-11-06 18:44:03 +08:00
UncleCode	0f0f60527d	Merge pull request #172 from aravindkarnam/scraper Scraper	2024-11-06 07:00:44 +01:00
Aravind Karnam	8105fd178e	Removed stubs for remove_from_future_crawls since the visited set is updated soon as the URL was queued, Removed add_to_retry_queue(url) since retry with exponential backoff with help of tenacity is going to take care of it.	2024-10-17 15:42:43 +05:30
Aravind Karnam	ce7fce4b16	1. Moved to asyncio.wait instead of gather so that results can be yeilded just as they are ready, rather than in batches 2. Moved the visted.add(url), to before the task is put in queue rather than after the crawl is completed. This makes sure that duplicate crawls doesn't happen when same URL is found at different depth and that get's queued too because the crawl is not yet completed and visted set is not updated. 3. Named the yield_results attribute to stream instead. Since that seems to be popularly used in all other AI libraries for intermediate results.	2024-10-17 12:25:17 +05:30
Aravind Karnam	de28b59aca	removed unused imports	2024-10-16 22:36:48 +05:30
Aravind Karnam	04d8b47b92	Exposed min_crawl_delay for BFSScraperStrategy	2024-10-16 22:34:54 +05:30
Aravind Karnam	2943feeecf	1. Added a flag to yield each crawl result,as they become ready along with the final scraper result as another option 2. Removed ascrape_many method, as I'm currently not focusing on it in the first cut of scraper 3. Added some error handling for cases where robots.txt cannot be fetched or parsed.	2024-10-16 22:05:29 +05:30
Aravind Karnam	8a7d29ce85	updated some comments and removed content type checking functionality from core as it's implemented as a filter	2024-10-16 15:59:37 +05:30
aravind	159bd875bd	Merge pull request #5 from aravindkarnam/main Merging 0.3.6	2024-10-16 10:41:22 +05:30
Aravind Karnam	d743adac68	Fixed some bugs in robots.txt processing	2024-10-03 15:58:57 +05:30
Aravind Karnam	7fe220dbd5	1. Introduced a bool flag to ascrape method to switch between sequential and concurrent processing 2. Introduced a dictionary for depth tracking across various tasks 3. Removed redundancy with crawled_urls variable. Instead created a list with visited set variable in returned object.	2024-10-03 11:17:11 +05:30
aravind	65e013d9d1	Merge pull request #3 from aravindkarnam/main Merging latest changes from main branch	2024-10-03 09:52:12 +05:30
Aravind Karnam	7f3e2e47ed	Parallel processing with retry on failure with exponential backoff - Simplified URL validation and normalisation - respecting Robots.txt	2024-09-19 12:34:12 +05:30
aravind	78f26ac263	Merge pull request #2 from aravindkarnam/staging Staging	2024-09-18 18:16:23 +05:30
Aravind Karnam	44ce12c62c	Created scaffolding for Scraper as per the plan. Implemented the ascrape method in bfs_scraper_strategy	2024-09-09 13:13:34 +05:30