Compare commits
5 Commits
scrapper
...
main-0.3.7
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ac9d83c72f | ||
|
|
ff9149b5c9 | ||
|
|
32f57c49d6 | ||
|
|
a5f627ba1a | ||
|
|
dbb587d681 |
25
.gitignore
vendored
25
.gitignore
vendored
@@ -199,35 +199,12 @@ test_env/
|
||||
**/.DS_Store
|
||||
|
||||
todo.md
|
||||
todo_executor.md
|
||||
git_changes.py
|
||||
git_changes.md
|
||||
pypi_build.sh
|
||||
git_issues.py
|
||||
git_issues.md
|
||||
|
||||
.next/
|
||||
.tests/
|
||||
# .issues/
|
||||
.docs/
|
||||
.issues/
|
||||
.gitboss/
|
||||
todo_executor.md
|
||||
protect-all-except-feature.sh
|
||||
manage-collab.sh
|
||||
publish.sh
|
||||
combine.sh
|
||||
combined_output.txt
|
||||
.local
|
||||
.scripts
|
||||
tree.md
|
||||
tree.md
|
||||
.scripts
|
||||
.local
|
||||
.do
|
||||
/plans
|
||||
.codeiumignore
|
||||
todo/
|
||||
|
||||
# windsurf rules
|
||||
.windsurfrules
|
||||
.issues/
|
||||
@@ -23,13 +23,14 @@ class AsyncWebCrawler:
|
||||
self,
|
||||
crawler_strategy: Optional[AsyncCrawlerStrategy] = None,
|
||||
always_by_pass_cache: bool = False,
|
||||
base_directory: str = str(Path.home()),
|
||||
**kwargs,
|
||||
):
|
||||
self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy(
|
||||
**kwargs
|
||||
)
|
||||
self.always_by_pass_cache = always_by_pass_cache
|
||||
self.crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
|
||||
self.crawl4ai_folder = os.path.join(base_directory, ".crawl4ai")
|
||||
os.makedirs(self.crawl4ai_folder, exist_ok=True)
|
||||
os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)
|
||||
self.ready = False
|
||||
|
||||
@@ -1,2 +0,0 @@
|
||||
from .async_web_scraper import AsyncWebScraper
|
||||
from .bfs_scraper_strategy import BFSScraperStrategy
|
||||
@@ -1,33 +0,0 @@
|
||||
from .scraper_strategy import ScraperStrategy
|
||||
from .models import ScraperResult, CrawlResult
|
||||
from ..async_webcrawler import AsyncWebCrawler
|
||||
from typing import Union, AsyncGenerator
|
||||
|
||||
class AsyncWebScraper:
    """Drive a site scrape by pairing a crawler with a traversal strategy.

    The crawler fetches individual pages; the strategy decides which URLs
    are visited and in what order.
    """

    def __init__(self, crawler: AsyncWebCrawler, strategy: ScraperStrategy):
        # Fetching backend and traversal policy, respectively.
        self.crawler = crawler
        self.strategy = strategy

    async def ascrape(self, url: str, parallel_processing: bool = True, stream: bool = False) -> Union[AsyncGenerator[CrawlResult, None], ScraperResult]:
        """Scrape starting at *url*.

        With ``stream=True`` an async generator of individual
        ``CrawlResult`` objects is handed back; otherwise results are
        accumulated and a single ``ScraperResult`` is returned.
        """
        if not stream:
            return await self._ascrape_collecting(url, parallel_processing)
        # The generator object is returned un-awaited so the caller can
        # consume it with ``async for``.
        return self._ascrape_yielding(url, parallel_processing)

    async def _ascrape_yielding(self, url: str, parallel_processing: bool) -> AsyncGenerator[CrawlResult, None]:
        """Forward each result from the strategy as soon as it is ready."""
        async for item in self.strategy.ascrape(url, self.crawler, parallel_processing):
            yield item

    async def _ascrape_collecting(self, url: str, parallel_processing: bool) -> ScraperResult:
        """Exhaust the strategy's generator, then bundle everything up."""
        collected = {}
        async for item in self.strategy.ascrape(url, self.crawler, parallel_processing):
            collected[item.url] = item

        # Summarize the whole run in a single result object.
        return ScraperResult(
            url=url,
            crawled_urls=list(collected.keys()),
            extracted_data=collected,
        )
|
||||
@@ -1,139 +0,0 @@
|
||||
from .scraper_strategy import ScraperStrategy
|
||||
from .filters import FilterChain
|
||||
from .scorers import URLScorer
|
||||
from ..models import CrawlResult
|
||||
from ..async_webcrawler import AsyncWebCrawler
|
||||
import asyncio
|
||||
import validators
|
||||
from urllib.parse import urljoin,urlparse,urlunparse
|
||||
from urllib.robotparser import RobotFileParser
|
||||
import time
|
||||
from aiolimiter import AsyncLimiter
|
||||
from tenacity import retry, stop_after_attempt, wait_exponential
|
||||
from collections import defaultdict
|
||||
import logging
|
||||
from typing import Dict, AsyncGenerator
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
|
||||
rate_limiter = AsyncLimiter(1, 1) # 1 request per second
|
||||
|
||||
class BFSScraperStrategy(ScraperStrategy):
    """Breadth-first scraping strategy with politeness, robots.txt
    compliance, per-second rate limiting, and retry-with-backoff.

    URLs are pulled from an ``asyncio.PriorityQueue`` ordered by the score
    produced by ``url_scorer`` (lower tuple values dequeue first), bounded
    by ``max_depth`` and filtered through ``filter_chain``.
    """

    def __init__(self, max_depth: int, filter_chain: FilterChain, url_scorer: URLScorer, max_concurrent: int = 5, min_crawl_delay: int=1):
        # Maximum link depth from the start URL that will be enqueued.
        self.max_depth = max_depth
        # Conjunction of URL filters; a link must pass all of them.
        self.filter_chain = filter_chain
        # Scorer whose value becomes the priority-queue key.
        self.url_scorer = url_scorer
        # Upper bound on concurrently running process_url tasks.
        self.max_concurrent = max_concurrent
        # For Crawl Politeness
        self.last_crawl_time = defaultdict(float)
        self.min_crawl_delay = min_crawl_delay # 1 second delay between requests to the same domain
        # For Robots.txt Compliance
        self.robot_parsers = {}

    # Robots.txt Parser
    def get_robot_parser(self, url: str) -> RobotFileParser:
        """Return a cached ``RobotFileParser`` for *url*'s domain.

        Returns ``None`` when robots.txt could not be fetched; callers
        treat that as "proceed with the crawl".
        """
        domain = urlparse(url)
        scheme = domain.scheme if domain.scheme else 'http' # Default to 'http' if no scheme provided
        netloc = domain.netloc
        if netloc not in self.robot_parsers:
            rp = RobotFileParser()
            rp.set_url(f"{scheme}://{netloc}/robots.txt")
            try:
                rp.read()
            except Exception as e:
                # Log the type of error, message, and the URL
                logging.warning(f"Error {type(e).__name__} occurred while fetching robots.txt for {netloc}: {e}")
                # NOTE(review): the failure is NOT cached, so the fetch is
                # retried on every URL of this domain — confirm intended.
                return None
            self.robot_parsers[netloc] = rp
        return self.robot_parsers[netloc]

    # Retry with exponential backoff
    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
    async def retry_crawl(self, crawler: AsyncWebCrawler, url: str) -> CrawlResult:
        """Fetch *url* via the crawler, retried up to 3 times with
        exponential backoff (4–10 s) by tenacity."""
        return await crawler.arun(url)

    async def process_url(self, url: str, depth: int, crawler: AsyncWebCrawler, queue: asyncio.PriorityQueue, visited: set, depths: Dict[str, int]) -> AsyncGenerator[CrawlResult, None]:
        """Validate, fetch and link-expand a single URL.

        Returns the ``CrawlResult`` (or ``None`` when the URL is invalid
        or disallowed by robots.txt).  Discovered links that pass the
        filter chain are scored and pushed onto *queue*.

        NOTE(review): despite the annotation this is a plain coroutine
        returning ``Optional[CrawlResult]``, not an async generator —
        the return type hint looks wrong; confirm before relying on it.
        """
        def normalize_url(url: str) -> str:
            # Drop the fragment so "page#a" and "page#b" dedupe to one URL.
            parsed = urlparse(url)
            return urlunparse(parsed._replace(fragment=""))

        # URL Validation
        if not validators.url(url):
            logging.warning(f"Invalid URL: {url}")
            return None

        # Robots.txt Compliance
        robot_parser = self.get_robot_parser(url)
        if robot_parser is None:
            logging.info(f"Could not retrieve robots.txt for {url}, hence proceeding with crawl.")
        else:
            # If robots.txt was fetched, check if crawling is allowed
            if not robot_parser.can_fetch(crawler.crawler_strategy.user_agent, url):
                logging.info(f"Skipping {url} as per robots.txt")
                return None

        # Crawl Politeness: enforce min_crawl_delay between hits to a domain.
        domain = urlparse(url).netloc
        time_since_last_crawl = time.time() - self.last_crawl_time[domain]
        if time_since_last_crawl < self.min_crawl_delay:
            await asyncio.sleep(self.min_crawl_delay - time_since_last_crawl)
        self.last_crawl_time[domain] = time.time()

        # Rate Limiting: module-level limiter allows 1 request per second.
        async with rate_limiter:
            # Error Handling
            try:
                crawl_result = await self.retry_crawl(crawler, url)
            except Exception as e:
                logging.error(f"Error crawling {url}: {str(e)}")
                # Synthesize a failed CrawlResult instead of propagating.
                crawl_result = CrawlResult(url=url, html="", success=False, status_code=0, error_message=str(e))

        if not crawl_result.success:
            # Logging and Monitoring
            logging.error(f"Failed to crawl URL: {url}. Error: {crawl_result.error_message}")
            return crawl_result

        # Process links: enqueue both internal and external links that pass
        # the filter chain, up to max_depth.
        for link_type in ["internal", "external"]:
            for link in crawl_result.links[link_type]:
                absolute_link = urljoin(url, link['href'])
                normalized_link = normalize_url(absolute_link)
                if self.filter_chain.apply(normalized_link) and normalized_link not in visited:
                    new_depth = depths[url] + 1
                    if new_depth <= self.max_depth:
                        # URL Scoring: score becomes the priority-queue key.
                        score = self.url_scorer.score(normalized_link)
                        await queue.put((score, new_depth, normalized_link))
                        depths[normalized_link] = new_depth
        return crawl_result

    async def ascrape(self, start_url: str, crawler: AsyncWebCrawler, parallel_processing:bool = True) -> AsyncGenerator[CrawlResult,None]:
        """BFS-crawl from *start_url*, yielding each CrawlResult as it
        completes.

        When *parallel_processing* is true, up to ``max_concurrent``
        process_url tasks run at once; otherwise URLs are processed one
        at a time.
        """
        queue = asyncio.PriorityQueue()
        queue.put_nowait((0, 0, start_url))
        visited = set()
        depths = {start_url: 0}
        pending_tasks = set()

        while not queue.empty() or pending_tasks:
            # Top up the in-flight task set from the queue.
            while not queue.empty() and len(pending_tasks) < self.max_concurrent:
                _, depth, url = await queue.get()
                if url not in visited:
                    # Adding URL to the visited set here itself, (instead of after result generation)
                    # so that other tasks are not queued for same URL, found at different depth before
                    # crawling and extraction of this task is completed.
                    visited.add(url)
                    if parallel_processing:
                        task = asyncio.create_task(self.process_url(url, depth, crawler, queue, visited, depths))
                        pending_tasks.add(task)
                    else:
                        result = await self.process_url(url, depth, crawler, queue, visited, depths)
                        if result:
                            yield result

            # Wait for the first task to complete and yield results incrementally as each task is completed
            if pending_tasks:
                done, pending_tasks = await asyncio.wait(pending_tasks, return_when=asyncio.FIRST_COMPLETED)
                for task in done:
                    result = await task
                    if result:
                        yield result
||||
@@ -1,3 +0,0 @@
|
||||
from .url_filter import URLFilter, FilterChain
|
||||
from .content_type_filter import ContentTypeFilter
|
||||
from .url_pattern_filter import URLPatternFilter
|
||||
@@ -1,8 +0,0 @@
|
||||
from .url_filter import URLFilter
|
||||
|
||||
class ContentTypeFilter(URLFilter):
    """URL filter keyed on an expected content type."""

    def __init__(self, contentType: str):
        # Content type this filter is meant to accept (e.g. "text/html").
        self.contentType = contentType

    def apply(self, url: str) -> bool:
        """Currently accepts every URL unconditionally."""
        # TODO: This is a stub. Will implement this later
        return True
||||
@@ -1,16 +0,0 @@
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
class URLFilter(ABC):
    """Interface for URL predicates applied during link discovery."""

    @abstractmethod
    def apply(self, url: str) -> bool:
        """Return ``True`` when *url* should be kept."""
        pass


class FilterChain:
    """Conjunction of URLFilter instances.

    A URL passes the chain only when every registered filter accepts it;
    an empty chain accepts everything.
    """

    def __init__(self):
        # Filters are evaluated in registration order.
        self.filters = []

    def add_filter(self, filter: URLFilter):
        """Register one more filter at the end of the chain."""
        self.filters.append(filter)

    def apply(self, url: str) -> bool:
        """Short-circuit on the first filter that rejects *url*."""
        for candidate in self.filters:
            if not candidate.apply(url):
                return False
        return True
|
||||
@@ -1,9 +0,0 @@
|
||||
from .url_filter import URLFilter
|
||||
from re import Pattern
|
||||
|
||||
class URLPatternFilter(URLFilter):
    """URL filter keyed on a compiled regular-expression pattern."""

    def __init__(self, pattern: Pattern):
        # Pattern the URL is meant to be matched against.
        self.pattern = pattern

    def apply(self, url: str) -> bool:
        """Currently accepts every URL unconditionally."""
        # TODO: This is a stub. Will implement this later.
        return True
|
||||
@@ -1,8 +0,0 @@
|
||||
from pydantic import BaseModel
|
||||
from typing import List, Dict
|
||||
from ..models import CrawlResult
|
||||
|
||||
class ScraperResult(BaseModel):
    """Aggregate outcome of a scrape run.

    Bundles the starting URL, every URL that was crawled, and the
    per-URL crawl results.
    """
    # Starting URL of the scrape.
    url: str
    # All URLs that were actually crawled during the run.
    crawled_urls: List[str]
    # Mapping of crawled URL -> its CrawlResult.
    extracted_data: Dict[str,CrawlResult]
|
||||
@@ -1,2 +0,0 @@
|
||||
from .url_scorer import URLScorer
|
||||
from .keyword_relevance_scorer import KeywordRelevanceScorer
|
||||
@@ -1,9 +0,0 @@
|
||||
from .url_scorer import URLScorer
|
||||
from typing import List
|
||||
|
||||
class KeywordRelevanceScorer(URLScorer):
    """Score URLs by their relevance to a set of keywords."""

    def __init__(self,keywords: List[str]):
        # Keywords used to judge relevance.
        # Fix: the attribute was previously misspelled "keyworkds".
        self.keywords = keywords
        # Deprecated alias kept for backward compatibility with any
        # callers that read the old misspelled attribute.
        self.keyworkds = self.keywords

    def score(self, url: str) -> float:
        """Return a relevance score for *url*; currently a constant."""
        #TODO: This is a stub. Will implement this later.
        return 1
||||
@@ -1,6 +0,0 @@
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
class URLScorer(ABC):
    """Strategy interface for assigning a numeric score to a URL."""

    @abstractmethod
    def score(self, url: str) -> float:
        """Return a numeric score for *url*."""
|
||||
@@ -1,26 +0,0 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from .models import ScraperResult, CrawlResult
|
||||
from ..models import CrawlResult
|
||||
from ..async_webcrawler import AsyncWebCrawler
|
||||
from typing import Union, AsyncGenerator
|
||||
|
||||
class ScraperStrategy(ABC):
    """Contract for site-traversal strategies driven by an AsyncWebCrawler."""

    @abstractmethod
    async def ascrape(self, url: str, crawler: AsyncWebCrawler, parallel_processing: bool = True, stream: bool = False) -> Union[AsyncGenerator[CrawlResult, None], ScraperResult]:
        """Traverse the site rooted at *url* with *crawler*.

        Args:
            url (str): The starting URL for the scrape.
            crawler (AsyncWebCrawler): The web crawler instance.
            parallel_processing (bool): Whether to use parallel processing. Defaults to True.
            stream (bool): If True, yields individual crawl results as they are ready;
                if False, accumulates results and returns a final ScraperResult.

        Yields:
            CrawlResult: Individual crawl results if stream is True.

        Returns:
            ScraperResult: A summary of the scrape results containing the final extracted data
                and the list of crawled URLs if stream is False.
        """
        pass
|
||||
Reference in New Issue
Block a user