From 44ce12c62c5c02421cd760c89d8ffda9dd59c208 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Mon, 9 Sep 2024 13:13:34 +0530 Subject: [PATCH 01/28] Created scaffolding for Scraper as per the plan. Implemented the ascrape method in bfs_scraper_strategy --- crawl4ai/scraper/__init__.py | 0 crawl4ai/scraper/async_web_scraper.py | 36 +++++++++++++ crawl4ai/scraper/bfs_scraper_strategy.py | 50 +++++++++++++++++++ crawl4ai/scraper/filters/__init__.py | 3 ++ .../scraper/filters/content_type_filter.py | 8 +++ crawl4ai/scraper/filters/url_filter.py | 16 ++++++ .../scraper/filters/url_pattern_filter.py | 9 ++++ crawl4ai/scraper/models.py | 7 +++ crawl4ai/scraper/scorers/__init__.py | 2 + .../scorers/keyword_relevance_scorer.py | 9 ++++ crawl4ai/scraper/scorers/url_scorer.py | 6 +++ crawl4ai/scraper/scraper_strategy.py | 9 ++++ 12 files changed, 155 insertions(+) create mode 100644 crawl4ai/scraper/__init__.py create mode 100644 crawl4ai/scraper/async_web_scraper.py create mode 100644 crawl4ai/scraper/bfs_scraper_strategy.py create mode 100644 crawl4ai/scraper/filters/__init__.py create mode 100644 crawl4ai/scraper/filters/content_type_filter.py create mode 100644 crawl4ai/scraper/filters/url_filter.py create mode 100644 crawl4ai/scraper/filters/url_pattern_filter.py create mode 100644 crawl4ai/scraper/models.py create mode 100644 crawl4ai/scraper/scorers/__init__.py create mode 100644 crawl4ai/scraper/scorers/keyword_relevance_scorer.py create mode 100644 crawl4ai/scraper/scorers/url_scorer.py create mode 100644 crawl4ai/scraper/scraper_strategy.py diff --git a/crawl4ai/scraper/__init__.py b/crawl4ai/scraper/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/crawl4ai/scraper/async_web_scraper.py b/crawl4ai/scraper/async_web_scraper.py new file mode 100644 index 00000000..c67f0e14 --- /dev/null +++ b/crawl4ai/scraper/async_web_scraper.py @@ -0,0 +1,36 @@ +import asyncio +from typing import List, Dict +from .scraper_strategy import ScraperStrategy +from 
.bfs_scraper_strategy import BFSScraperStrategy +from .models import ScraperResult +from ..async_webcrawler import AsyncWebCrawler + +class BatchProcessor: + def __init__(self, batch_size: int, concurrency_limit: int): + self.batch_size = batch_size + self.concurrency_limit = concurrency_limit + + async def process_batch(self, scraper: 'AsyncWebScraper', urls: List[str]) -> List[ScraperResult]: + semaphore = asyncio.Semaphore(self.concurrency_limit) + async def scrape_with_semaphore(url): + async with semaphore: + return await scraper.ascrape(url) + return await asyncio.gather(*[scrape_with_semaphore(url) for url in urls]) + +class AsyncWebScraper: + def __init__(self, crawler: AsyncWebCrawler, strategy: ScraperStrategy, batch_size: int = 10, concurrency_limit: int = 5): + self.crawler = crawler + self.strategy = strategy + self.batch_processor = BatchProcessor(batch_size, concurrency_limit) + + async def ascrape(self, url: str) -> ScraperResult: + crawl_result = await self.crawler.arun(url) + return await self.strategy.ascrape(url, crawl_result, self.crawler) + + async def ascrape_many(self, urls: List[str]) -> List[ScraperResult]: + all_results = [] + for i in range(0, len(urls), self.batch_processor.batch_size): + batch = urls[i:i+self.batch_processor.batch_size] + batch_results = await self.batch_processor.process_batch(self, batch) + all_results.extend(batch_results) + return all_results \ No newline at end of file diff --git a/crawl4ai/scraper/bfs_scraper_strategy.py b/crawl4ai/scraper/bfs_scraper_strategy.py new file mode 100644 index 00000000..9add962e --- /dev/null +++ b/crawl4ai/scraper/bfs_scraper_strategy.py @@ -0,0 +1,50 @@ +from .scraper_strategy import ScraperStrategy +from .filters import FilterChain +from .scorers import URLScorer +from .models import ScraperResult +from ..models import CrawlResult +from ..async_webcrawler import AsyncWebCrawler +import asyncio +from urllib.parse import urljoin + +class BFSScraperStrategy(ScraperStrategy): + def 
__init__(self, max_depth: int, filter_chain: FilterChain, url_scorer: URLScorer): + self.max_depth = max_depth + self.filter_chain = filter_chain + self.url_scorer = url_scorer + + async def ascrape(self, start_url: str, initial_crawl_result: CrawlResult, crawler: AsyncWebCrawler) -> ScraperResult: + queue = asyncio.PriorityQueue() + queue.put_nowait((0, 0, start_url)) # (score, depth, url) + visited = set() + crawled_urls = [] + extracted_data = {} + + while not queue.empty(): + _, depth, url = await queue.get() + if depth > self.max_depth or url in visited: + continue + crawl_result = initial_crawl_result if url == start_url else await crawler.arun(url) + visited.add(url) + crawled_urls.append(url) + extracted_data[url]=crawl_result + if crawl_result.success == False: + print(f"failed to crawl -- {url}") + continue + for internal in crawl_result.links["internal"]: + link = internal['href'] + is_special_uri = any(link.startswith(scheme) for scheme in ('tel:', 'mailto:', 'sms:', 'geo:', 'fax:', 'file:', 'data:', 'sip:', 'ircs:', 'magnet:')) + is_fragment = '#' in link + if not (is_fragment or is_special_uri): + # To fix partial links: eg:'/support' to 'https://example.com/support' + absolute_link = urljoin(url, link) + if self.filter_chain.apply(absolute_link) and absolute_link not in visited: + score = self.url_scorer.score(absolute_link) + await queue.put((1 / score, depth + 1, absolute_link)) + for external in crawl_result.links["external"]: + link = external['href'] + if self.filter_chain.apply(link) and link not in visited: + score = self.url_scorer.score(link) + await queue.put((1 / score, depth + 1, link)) + + return ScraperResult(url=start_url, crawled_urls=crawled_urls, extracted_data=extracted_data) \ No newline at end of file diff --git a/crawl4ai/scraper/filters/__init__.py b/crawl4ai/scraper/filters/__init__.py new file mode 100644 index 00000000..525c9bdb --- /dev/null +++ b/crawl4ai/scraper/filters/__init__.py @@ -0,0 +1,3 @@ +from .url_filter import 
URLFilter, FilterChain +from .content_type_filter import ContentTypeFilter +from .url_pattern_filter import URLPatternFilter \ No newline at end of file diff --git a/crawl4ai/scraper/filters/content_type_filter.py b/crawl4ai/scraper/filters/content_type_filter.py new file mode 100644 index 00000000..9173eb4a --- /dev/null +++ b/crawl4ai/scraper/filters/content_type_filter.py @@ -0,0 +1,8 @@ +from .url_filter import URLFilter + +class ContentTypeFilter(URLFilter): + def __init__(self, contentType: str): + self.contentType = contentType + def apply(self, url: str) -> bool: + #TODO: This is a stub. Will implement this later + return True \ No newline at end of file diff --git a/crawl4ai/scraper/filters/url_filter.py b/crawl4ai/scraper/filters/url_filter.py new file mode 100644 index 00000000..2b8bd6eb --- /dev/null +++ b/crawl4ai/scraper/filters/url_filter.py @@ -0,0 +1,16 @@ +from abc import ABC, abstractmethod + +class URLFilter(ABC): + @abstractmethod + def apply(self, url: str) -> bool: + pass + +class FilterChain: + def __init__(self): + self.filters = [] + + def add_filter(self, filter: URLFilter): + self.filters.append(filter) + + def apply(self, url: str) -> bool: + return all(filter.apply(url) for filter in self.filters) \ No newline at end of file diff --git a/crawl4ai/scraper/filters/url_pattern_filter.py b/crawl4ai/scraper/filters/url_pattern_filter.py new file mode 100644 index 00000000..fd5df133 --- /dev/null +++ b/crawl4ai/scraper/filters/url_pattern_filter.py @@ -0,0 +1,9 @@ +from .url_filter import URLFilter +from re import Pattern + +class URLPatternFilter(URLFilter): + def __init__(self, pattern: Pattern): + self.pattern = pattern + def apply(self, url: str) -> bool: + #TODO: This is a stub. Will implement this later. 
+ return True \ No newline at end of file diff --git a/crawl4ai/scraper/models.py b/crawl4ai/scraper/models.py new file mode 100644 index 00000000..9ffdac52 --- /dev/null +++ b/crawl4ai/scraper/models.py @@ -0,0 +1,7 @@ +from pydantic import BaseModel +from typing import List, Dict + +class ScraperResult(BaseModel): + url: str + crawled_urls: List[str] + extracted_data: Dict \ No newline at end of file diff --git a/crawl4ai/scraper/scorers/__init__.py b/crawl4ai/scraper/scorers/__init__.py new file mode 100644 index 00000000..05c61c94 --- /dev/null +++ b/crawl4ai/scraper/scorers/__init__.py @@ -0,0 +1,2 @@ +from .url_scorer import URLScorer +from .keyword_relevance_scorer import KeywordRelevanceScorer \ No newline at end of file diff --git a/crawl4ai/scraper/scorers/keyword_relevance_scorer.py b/crawl4ai/scraper/scorers/keyword_relevance_scorer.py new file mode 100644 index 00000000..a2338aec --- /dev/null +++ b/crawl4ai/scraper/scorers/keyword_relevance_scorer.py @@ -0,0 +1,9 @@ +from .url_scorer import URLScorer +from typing import List + +class KeywordRelevanceScorer(URLScorer): + def __init__(self,keywords: List[str]): + self.keyworkds = keywords + def score(self, url: str) -> float: + #TODO: This is a stub. Will implement this later. 
+ return 1 \ No newline at end of file diff --git a/crawl4ai/scraper/scorers/url_scorer.py b/crawl4ai/scraper/scorers/url_scorer.py new file mode 100644 index 00000000..6ee9ab05 --- /dev/null +++ b/crawl4ai/scraper/scorers/url_scorer.py @@ -0,0 +1,6 @@ +from abc import ABC, abstractmethod + +class URLScorer(ABC): + @abstractmethod + def score(self, url: str) -> float: + pass \ No newline at end of file diff --git a/crawl4ai/scraper/scraper_strategy.py b/crawl4ai/scraper/scraper_strategy.py new file mode 100644 index 00000000..16df9ece --- /dev/null +++ b/crawl4ai/scraper/scraper_strategy.py @@ -0,0 +1,9 @@ +from abc import ABC, abstractmethod +from .models import ScraperResult +from ..models import CrawlResult +from ..async_webcrawler import AsyncWebCrawler + +class ScraperStrategy(ABC): + @abstractmethod + async def ascrape(self, url: str, crawl_result: CrawlResult, crawler: AsyncWebCrawler) -> ScraperResult: + pass \ No newline at end of file From 7f3e2e47ed99de1db4bc99d69d0a6f1ddaef962f Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Thu, 19 Sep 2024 12:34:12 +0530 Subject: [PATCH 02/28] Parallel processing with retry on failure with exponential backoff - Simplified URL validation and normalisation - respecting Robots.txt --- crawl4ai/scraper/__init__.py | 2 + crawl4ai/scraper/async_web_scraper.py | 3 +- crawl4ai/scraper/bfs_scraper_strategy.py | 139 ++++++++++++++++++----- crawl4ai/scraper/models.py | 3 +- crawl4ai/scraper/scraper_strategy.py | 2 +- 5 files changed, 116 insertions(+), 33 deletions(-) diff --git a/crawl4ai/scraper/__init__.py b/crawl4ai/scraper/__init__.py index e69de29b..1997e162 100644 --- a/crawl4ai/scraper/__init__.py +++ b/crawl4ai/scraper/__init__.py @@ -0,0 +1,2 @@ +from .async_web_scraper import AsyncWebScraper +from .bfs_scraper_strategy import BFSScraperStrategy \ No newline at end of file diff --git a/crawl4ai/scraper/async_web_scraper.py b/crawl4ai/scraper/async_web_scraper.py index c67f0e14..6cf5488c 100644 --- 
a/crawl4ai/scraper/async_web_scraper.py +++ b/crawl4ai/scraper/async_web_scraper.py @@ -24,8 +24,7 @@ class AsyncWebScraper: self.batch_processor = BatchProcessor(batch_size, concurrency_limit) async def ascrape(self, url: str) -> ScraperResult: - crawl_result = await self.crawler.arun(url) - return await self.strategy.ascrape(url, crawl_result, self.crawler) + return await self.strategy.ascrape(url, self.crawler) async def ascrape_many(self, urls: List[str]) -> List[ScraperResult]: all_results = [] diff --git a/crawl4ai/scraper/bfs_scraper_strategy.py b/crawl4ai/scraper/bfs_scraper_strategy.py index 9add962e..a8fb1fe1 100644 --- a/crawl4ai/scraper/bfs_scraper_strategy.py +++ b/crawl4ai/scraper/bfs_scraper_strategy.py @@ -5,46 +5,127 @@ from .models import ScraperResult from ..models import CrawlResult from ..async_webcrawler import AsyncWebCrawler import asyncio -from urllib.parse import urljoin +import validators +from urllib.parse import urljoin,urlparse,urlunparse +from urllib.robotparser import RobotFileParser +import time +from aiolimiter import AsyncLimiter +from tenacity import retry, stop_after_attempt, wait_exponential +from collections import defaultdict +import logging +logging.basicConfig(level=logging.DEBUG) + +rate_limiter = AsyncLimiter(1, 1) # 1 request per second class BFSScraperStrategy(ScraperStrategy): - def __init__(self, max_depth: int, filter_chain: FilterChain, url_scorer: URLScorer): + def __init__(self, max_depth: int, filter_chain: FilterChain, url_scorer: URLScorer, max_concurrent: int = 5): self.max_depth = max_depth self.filter_chain = filter_chain self.url_scorer = url_scorer + self.max_concurrent = max_concurrent + # 9. Crawl Politeness + self.last_crawl_time = defaultdict(float) + self.min_crawl_delay = 1 # 1 second delay between requests to the same domain + # 5. 
Robots.txt Compliance + self.robot_parsers = {} + + # Robots.txt Parser + def get_robot_parser(self, url: str) -> RobotFileParser: + domain = urlparse(url).netloc + if domain not in self.robot_parsers: + rp = RobotFileParser() + rp.set_url(f"https://{domain}/robots.txt") + rp.read() + self.robot_parsers[domain] = rp + return self.robot_parsers[domain] + + # Retry with exponential backoff + @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10)) + async def retry_crawl(self, crawler: AsyncWebCrawler, url: str) -> CrawlResult: + return await crawler.arun(url) + + async def process_url(self, url: str, depth: int, crawler: AsyncWebCrawler, queue: asyncio.PriorityQueue, visited: set) -> CrawlResult: + def normalize_url(url: str) -> str: + parsed = urlparse(url) + return urlunparse(parsed._replace(fragment="")) + + # URL Validation + if not validators.url(url): + logging.warning(f"Invalid URL: {url}") + return None + + # Robots.txt Compliance + if not self.get_robot_parser(url).can_fetch("YourUserAgent", url): + logging.info(f"Skipping {url} as per robots.txt") + return None + + # Crawl Politeness + domain = urlparse(url).netloc + time_since_last_crawl = time.time() - self.last_crawl_time[domain] + if time_since_last_crawl < self.min_crawl_delay: + await asyncio.sleep(self.min_crawl_delay - time_since_last_crawl) + self.last_crawl_time[domain] = time.time() - async def ascrape(self, start_url: str, initial_crawl_result: CrawlResult, crawler: AsyncWebCrawler) -> ScraperResult: + # Rate Limiting + async with rate_limiter: + # Error Handling + try: + crawl_result = await self.retry_crawl(crawler, url) + except Exception as e: + logging.error(f"Error crawling {url}: {str(e)}") + crawl_result = CrawlResult(url=url, html="", success=False, status_code=0, error_message=str(e)) + + if not crawl_result.success: + # Logging and Monitoring + logging.error(f"Failed to crawl URL: {url}. 
Error: {crawl_result.error_message}") + # Error Categorization + if crawl_result.status_code == 404: + self.remove_from_future_crawls(url) + elif crawl_result.status_code == 503: + await self.add_to_retry_queue(url) + return crawl_result + + # Content Type Checking + # if 'text/html' not in crawl_result.response_header.get('Content-Type', ''): + # logging.info(f"Skipping non-HTML content: {url}") + # return crawl_result + + visited.add(url) + + # Process links + for link_type in ["internal", "external"]: + for link in crawl_result.links[link_type]: + absolute_link = urljoin(url, link['href']) + normalized_link = normalize_url(absolute_link) + if self.filter_chain.apply(normalized_link) and normalized_link not in visited: + new_depth = depth + 1 + if new_depth <= self.max_depth: + # URL Scoring + score = self.url_scorer.score(normalized_link) + await queue.put((score, new_depth, normalized_link)) + + return crawl_result + + async def ascrape(self, start_url: str, crawler: AsyncWebCrawler) -> ScraperResult: queue = asyncio.PriorityQueue() - queue.put_nowait((0, 0, start_url)) # (score, depth, url) + queue.put_nowait((0, 0, start_url)) visited = set() crawled_urls = [] extracted_data = {} while not queue.empty(): - _, depth, url = await queue.get() - if depth > self.max_depth or url in visited: - continue - crawl_result = initial_crawl_result if url == start_url else await crawler.arun(url) - visited.add(url) - crawled_urls.append(url) - extracted_data[url]=crawl_result - if crawl_result.success == False: - print(f"failed to crawl -- {url}") - continue - for internal in crawl_result.links["internal"]: - link = internal['href'] - is_special_uri = any(link.startswith(scheme) for scheme in ('tel:', 'mailto:', 'sms:', 'geo:', 'fax:', 'file:', 'data:', 'sip:', 'ircs:', 'magnet:')) - is_fragment = '#' in link - if not (is_fragment or is_special_uri): - # To fix partial links: eg:'/support' to 'https://example.com/support' - absolute_link = urljoin(url, link) - if 
self.filter_chain.apply(absolute_link) and absolute_link not in visited: - score = self.url_scorer.score(absolute_link) - await queue.put((1 / score, depth + 1, absolute_link)) - for external in crawl_result.links["external"]: - link = external['href'] - if self.filter_chain.apply(link) and link not in visited: - score = self.url_scorer.score(link) - await queue.put((1 / score, depth + 1, link)) + tasks = [] + while not queue.empty() and len(tasks) < self.max_concurrent: + _, depth, url = await queue.get() + if url not in visited: + task = asyncio.create_task(self.process_url(url, depth, crawler, queue, visited)) + tasks.append(task) + + if tasks: + results = await asyncio.gather(*tasks) + for result in results: + if result: + crawled_urls.append(result.url) + extracted_data[result.url] = result return ScraperResult(url=start_url, crawled_urls=crawled_urls, extracted_data=extracted_data) \ No newline at end of file diff --git a/crawl4ai/scraper/models.py b/crawl4ai/scraper/models.py index 9ffdac52..735d1d58 100644 --- a/crawl4ai/scraper/models.py +++ b/crawl4ai/scraper/models.py @@ -1,7 +1,8 @@ from pydantic import BaseModel from typing import List, Dict +from ..models import CrawlResult class ScraperResult(BaseModel): url: str crawled_urls: List[str] - extracted_data: Dict \ No newline at end of file + extracted_data: Dict[str,CrawlResult] \ No newline at end of file diff --git a/crawl4ai/scraper/scraper_strategy.py b/crawl4ai/scraper/scraper_strategy.py index 16df9ece..6d1cdc74 100644 --- a/crawl4ai/scraper/scraper_strategy.py +++ b/crawl4ai/scraper/scraper_strategy.py @@ -5,5 +5,5 @@ from ..async_webcrawler import AsyncWebCrawler class ScraperStrategy(ABC): @abstractmethod - async def ascrape(self, url: str, crawl_result: CrawlResult, crawler: AsyncWebCrawler) -> ScraperResult: + async def ascrape(self, url: str, crawler: AsyncWebCrawler) -> ScraperResult: pass \ No newline at end of file From 7fe220dbd587bc72a0ccaf27ad74af48ca747d6e Mon Sep 17 00:00:00 2001 
From: Aravind Karnam Date: Thu, 3 Oct 2024 11:17:11 +0530 Subject: [PATCH 03/28] 1. Introduced a bool flag to ascrape method to switch between sequential and concurrent processing 2. Introduced a dictionary for depth tracking across various tasks 3. Removed redundancy with crawled_urls variable. Instead created a list with visited set variable in returned object. --- crawl4ai/scraper/async_web_scraper.py | 4 ++-- crawl4ai/scraper/bfs_scraper_strategy.py | 24 +++++++++++++++--------- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/crawl4ai/scraper/async_web_scraper.py b/crawl4ai/scraper/async_web_scraper.py index 6cf5488c..fadfa61f 100644 --- a/crawl4ai/scraper/async_web_scraper.py +++ b/crawl4ai/scraper/async_web_scraper.py @@ -23,8 +23,8 @@ class AsyncWebScraper: self.strategy = strategy self.batch_processor = BatchProcessor(batch_size, concurrency_limit) - async def ascrape(self, url: str) -> ScraperResult: - return await self.strategy.ascrape(url, self.crawler) + async def ascrape(self, url: str, parallel_processing: bool = True) -> ScraperResult: + return await self.strategy.ascrape(url, self.crawler, parallel_processing) async def ascrape_many(self, urls: List[str]) -> List[ScraperResult]: all_results = [] diff --git a/crawl4ai/scraper/bfs_scraper_strategy.py b/crawl4ai/scraper/bfs_scraper_strategy.py index a8fb1fe1..a3bb6750 100644 --- a/crawl4ai/scraper/bfs_scraper_strategy.py +++ b/crawl4ai/scraper/bfs_scraper_strategy.py @@ -13,6 +13,7 @@ from aiolimiter import AsyncLimiter from tenacity import retry, stop_after_attempt, wait_exponential from collections import defaultdict import logging +from typing import Dict logging.basicConfig(level=logging.DEBUG) rate_limiter = AsyncLimiter(1, 1) # 1 request per second @@ -44,7 +45,7 @@ class BFSScraperStrategy(ScraperStrategy): async def retry_crawl(self, crawler: AsyncWebCrawler, url: str) -> CrawlResult: return await crawler.arun(url) - async def process_url(self, url: str, depth: int, crawler: 
AsyncWebCrawler, queue: asyncio.PriorityQueue, visited: set) -> CrawlResult: + async def process_url(self, url: str, depth: int, crawler: AsyncWebCrawler, queue: asyncio.PriorityQueue, visited: set, depths: Dict[str, int]) -> CrawlResult: def normalize_url(url: str) -> str: parsed = urlparse(url) return urlunparse(parsed._replace(fragment="")) @@ -98,34 +99,39 @@ class BFSScraperStrategy(ScraperStrategy): absolute_link = urljoin(url, link['href']) normalized_link = normalize_url(absolute_link) if self.filter_chain.apply(normalized_link) and normalized_link not in visited: - new_depth = depth + 1 + new_depth = depths[url] + 1 if new_depth <= self.max_depth: # URL Scoring score = self.url_scorer.score(normalized_link) await queue.put((score, new_depth, normalized_link)) + depths[normalized_link] = new_depth return crawl_result - async def ascrape(self, start_url: str, crawler: AsyncWebCrawler) -> ScraperResult: + async def ascrape(self, start_url: str, crawler: AsyncWebCrawler, parallel_processing:bool = True) -> ScraperResult: queue = asyncio.PriorityQueue() queue.put_nowait((0, 0, start_url)) visited = set() - crawled_urls = [] extracted_data = {} + depths = {start_url: 0} while not queue.empty(): tasks = [] while not queue.empty() and len(tasks) < self.max_concurrent: _, depth, url = await queue.get() if url not in visited: - task = asyncio.create_task(self.process_url(url, depth, crawler, queue, visited)) - tasks.append(task) + if parallel_processing: + task = asyncio.create_task(self.process_url(url, depth, crawler, queue, visited, depths)) + tasks.append(task) + else: + result = await self.process_url(url, depth, crawler, queue, visited, depths) + if result: + extracted_data[result.url] = result - if tasks: + if parallel_processing and tasks: results = await asyncio.gather(*tasks) for result in results: if result: - crawled_urls.append(result.url) extracted_data[result.url] = result - return ScraperResult(url=start_url, crawled_urls=crawled_urls, 
extracted_data=extracted_data) \ No newline at end of file + return ScraperResult(url=start_url, crawled_urls=list(visited), extracted_data=extracted_data) \ No newline at end of file From d743adac68fc4c606283428de3451634c1a5e04f Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Thu, 3 Oct 2024 15:58:57 +0530 Subject: [PATCH 04/28] Fixed some bugs in robots.txt processing --- crawl4ai/scraper/bfs_scraper_strategy.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/crawl4ai/scraper/bfs_scraper_strategy.py b/crawl4ai/scraper/bfs_scraper_strategy.py index a3bb6750..dc89047a 100644 --- a/crawl4ai/scraper/bfs_scraper_strategy.py +++ b/crawl4ai/scraper/bfs_scraper_strategy.py @@ -32,13 +32,17 @@ class BFSScraperStrategy(ScraperStrategy): # Robots.txt Parser def get_robot_parser(self, url: str) -> RobotFileParser: - domain = urlparse(url).netloc - if domain not in self.robot_parsers: + domain = urlparse(url) + scheme = domain.scheme if domain.scheme else 'http' # Default to 'http' if no scheme provided + netloc = domain.netloc + + if netloc not in self.robot_parsers: rp = RobotFileParser() - rp.set_url(f"https://{domain}/robots.txt") + rp.set_url(f"{scheme}://{netloc}/robots.txt") rp.read() - self.robot_parsers[domain] = rp - return self.robot_parsers[domain] + self.robot_parsers[netloc] = rp + return self.robot_parsers[netloc] + # Retry with exponential backoff @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10)) @@ -56,7 +60,7 @@ class BFSScraperStrategy(ScraperStrategy): return None # Robots.txt Compliance - if not self.get_robot_parser(url).can_fetch("YourUserAgent", url): + if not self.get_robot_parser(url).can_fetch(crawler.crawler_strategy.user_agent, url): logging.info(f"Skipping {url} as per robots.txt") return None From 8a7d29ce85056a51f03049b37c51d83d5304743c Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Wed, 16 Oct 2024 15:59:37 +0530 Subject: [PATCH 05/28] updated some comments and 
removed content type checking functionality from core as it's implemented as a filter --- crawl4ai/scraper/bfs_scraper_strategy.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/crawl4ai/scraper/bfs_scraper_strategy.py b/crawl4ai/scraper/bfs_scraper_strategy.py index dc89047a..9022cd90 100644 --- a/crawl4ai/scraper/bfs_scraper_strategy.py +++ b/crawl4ai/scraper/bfs_scraper_strategy.py @@ -24,18 +24,17 @@ class BFSScraperStrategy(ScraperStrategy): self.filter_chain = filter_chain self.url_scorer = url_scorer self.max_concurrent = max_concurrent - # 9. Crawl Politeness + # For Crawl Politeness self.last_crawl_time = defaultdict(float) self.min_crawl_delay = 1 # 1 second delay between requests to the same domain - # 5. Robots.txt Compliance + # For Robots.txt Compliance self.robot_parsers = {} - + # Robots.txt Parser def get_robot_parser(self, url: str) -> RobotFileParser: domain = urlparse(url) scheme = domain.scheme if domain.scheme else 'http' # Default to 'http' if no scheme provided netloc = domain.netloc - if netloc not in self.robot_parsers: rp = RobotFileParser() rp.set_url(f"{scheme}://{netloc}/robots.txt") @@ -90,11 +89,6 @@ class BFSScraperStrategy(ScraperStrategy): await self.add_to_retry_queue(url) return crawl_result - # Content Type Checking - # if 'text/html' not in crawl_result.response_header.get('Content-Type', ''): - # logging.info(f"Skipping non-HTML content: {url}") - # return crawl_result - visited.add(url) # Process links From 2943feeecf44806a000f5c01502798e52278bce7 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Wed, 16 Oct 2024 22:05:29 +0530 Subject: [PATCH 06/28] 1. Added a flag to yield each crawl result,as they become ready along with the final scraper result as another option 2. Removed ascrape_many method, as I'm currently not focusing on it in the first cut of scraper 3. Added some error handling for cases where robots.txt cannot be fetched or parsed. 
--- crawl4ai/scraper/async_web_scraper.py | 48 ++++++++++++------------ crawl4ai/scraper/bfs_scraper_strategy.py | 33 +++++++++------- crawl4ai/scraper/scraper_strategy.py | 21 ++++++++++- 3 files changed, 62 insertions(+), 40 deletions(-) diff --git a/crawl4ai/scraper/async_web_scraper.py b/crawl4ai/scraper/async_web_scraper.py index fadfa61f..811aeacc 100644 --- a/crawl4ai/scraper/async_web_scraper.py +++ b/crawl4ai/scraper/async_web_scraper.py @@ -1,35 +1,35 @@ import asyncio from typing import List, Dict from .scraper_strategy import ScraperStrategy -from .bfs_scraper_strategy import BFSScraperStrategy -from .models import ScraperResult +from .models import ScraperResult, CrawlResult from ..async_webcrawler import AsyncWebCrawler - -class BatchProcessor: - def __init__(self, batch_size: int, concurrency_limit: int): - self.batch_size = batch_size - self.concurrency_limit = concurrency_limit - - async def process_batch(self, scraper: 'AsyncWebScraper', urls: List[str]) -> List[ScraperResult]: - semaphore = asyncio.Semaphore(self.concurrency_limit) - async def scrape_with_semaphore(url): - async with semaphore: - return await scraper.ascrape(url) - return await asyncio.gather(*[scrape_with_semaphore(url) for url in urls]) +from typing import Union, AsyncGenerator class AsyncWebScraper: def __init__(self, crawler: AsyncWebCrawler, strategy: ScraperStrategy, batch_size: int = 10, concurrency_limit: int = 5): self.crawler = crawler self.strategy = strategy - self.batch_processor = BatchProcessor(batch_size, concurrency_limit) - async def ascrape(self, url: str, parallel_processing: bool = True) -> ScraperResult: - return await self.strategy.ascrape(url, self.crawler, parallel_processing) + async def ascrape(self, url: str, parallel_processing: bool = True, yield_results: bool = False) -> Union[AsyncGenerator[CrawlResult, None], ScraperResult]: + if yield_results: + return self._ascrape_yielding(url, parallel_processing) + else: + return await 
self._ascrape_collecting(url, parallel_processing) - async def ascrape_many(self, urls: List[str]) -> List[ScraperResult]: - all_results = [] - for i in range(0, len(urls), self.batch_processor.batch_size): - batch = urls[i:i+self.batch_processor.batch_size] - batch_results = await self.batch_processor.process_batch(self, batch) - all_results.extend(batch_results) - return all_results \ No newline at end of file + async def _ascrape_yielding(self, url: str, parallel_processing: bool) -> AsyncGenerator[CrawlResult, None]: + result_generator = self.strategy.ascrape(url, self.crawler, parallel_processing) + async for res in result_generator: # Consume the async generator + yield res # Yielding individual results + + async def _ascrape_collecting(self, url: str, parallel_processing: bool) -> ScraperResult: + extracted_data = {} + result_generator = self.strategy.ascrape(url, self.crawler, parallel_processing) + async for res in result_generator: # Consume the async generator + extracted_data[res.url] = res + + # Return a final ScraperResult + return ScraperResult( + url=url, + crawled_urls=list(extracted_data.keys()), + extracted_data=extracted_data + ) \ No newline at end of file diff --git a/crawl4ai/scraper/bfs_scraper_strategy.py b/crawl4ai/scraper/bfs_scraper_strategy.py index 9022cd90..1146714d 100644 --- a/crawl4ai/scraper/bfs_scraper_strategy.py +++ b/crawl4ai/scraper/bfs_scraper_strategy.py @@ -1,7 +1,6 @@ from .scraper_strategy import ScraperStrategy from .filters import FilterChain from .scorers import URLScorer -from .models import ScraperResult from ..models import CrawlResult from ..async_webcrawler import AsyncWebCrawler import asyncio @@ -13,7 +12,7 @@ from aiolimiter import AsyncLimiter from tenacity import retry, stop_after_attempt, wait_exponential from collections import defaultdict import logging -from typing import Dict +from typing import Dict, AsyncGenerator logging.basicConfig(level=logging.DEBUG) rate_limiter = AsyncLimiter(1, 1) # 1 request 
per second @@ -38,7 +37,12 @@ class BFSScraperStrategy(ScraperStrategy): if netloc not in self.robot_parsers: rp = RobotFileParser() rp.set_url(f"{scheme}://{netloc}/robots.txt") - rp.read() + try: + rp.read() + except Exception as e: + # Log the type of error, message, and the URL + logging.warning(f"Error {type(e).__name__} occurred while fetching robots.txt for {netloc}: {e}") + return None self.robot_parsers[netloc] = rp return self.robot_parsers[netloc] @@ -48,7 +52,7 @@ class BFSScraperStrategy(ScraperStrategy): async def retry_crawl(self, crawler: AsyncWebCrawler, url: str) -> CrawlResult: return await crawler.arun(url) - async def process_url(self, url: str, depth: int, crawler: AsyncWebCrawler, queue: asyncio.PriorityQueue, visited: set, depths: Dict[str, int]) -> CrawlResult: + async def process_url(self, url: str, depth: int, crawler: AsyncWebCrawler, queue: asyncio.PriorityQueue, visited: set, depths: Dict[str, int]) -> AsyncGenerator[CrawlResult, None]: def normalize_url(url: str) -> str: parsed = urlparse(url) return urlunparse(parsed._replace(fragment="")) @@ -59,9 +63,14 @@ class BFSScraperStrategy(ScraperStrategy): return None # Robots.txt Compliance - if not self.get_robot_parser(url).can_fetch(crawler.crawler_strategy.user_agent, url): - logging.info(f"Skipping {url} as per robots.txt") - return None + robot_parser = self.get_robot_parser(url) + if robot_parser is None: + logging.info(f"Could not retrieve robots.txt for {url}, hence proceeding with crawl.") + else: + # If robots.txt was fetched, check if crawling is allowed + if not robot_parser.can_fetch(crawler.crawler_strategy.user_agent, url): + logging.info(f"Skipping {url} as per robots.txt") + return None # Crawl Politeness domain = urlparse(url).netloc @@ -103,14 +112,12 @@ class BFSScraperStrategy(ScraperStrategy): score = self.url_scorer.score(normalized_link) await queue.put((score, new_depth, normalized_link)) depths[normalized_link] = new_depth - return crawl_result - async def 
ascrape(self, start_url: str, crawler: AsyncWebCrawler, parallel_processing:bool = True) -> ScraperResult: + async def ascrape(self, start_url: str, crawler: AsyncWebCrawler, parallel_processing:bool = True) -> CrawlResult: queue = asyncio.PriorityQueue() queue.put_nowait((0, 0, start_url)) visited = set() - extracted_data = {} depths = {start_url: 0} while not queue.empty(): @@ -124,12 +131,10 @@ class BFSScraperStrategy(ScraperStrategy): else: result = await self.process_url(url, depth, crawler, queue, visited, depths) if result: - extracted_data[result.url] = result + yield result if parallel_processing and tasks: results = await asyncio.gather(*tasks) for result in results: if result: - extracted_data[result.url] = result - - return ScraperResult(url=start_url, crawled_urls=list(visited), extracted_data=extracted_data) \ No newline at end of file + yield result \ No newline at end of file diff --git a/crawl4ai/scraper/scraper_strategy.py b/crawl4ai/scraper/scraper_strategy.py index 6d1cdc74..e08a980d 100644 --- a/crawl4ai/scraper/scraper_strategy.py +++ b/crawl4ai/scraper/scraper_strategy.py @@ -1,9 +1,26 @@ from abc import ABC, abstractmethod -from .models import ScraperResult +from .models import ScraperResult, CrawlResult from ..models import CrawlResult from ..async_webcrawler import AsyncWebCrawler +from typing import Union, AsyncGenerator class ScraperStrategy(ABC): @abstractmethod - async def ascrape(self, url: str, crawler: AsyncWebCrawler) -> ScraperResult: + async def ascrape(self, url: str, crawler: AsyncWebCrawler, parallel_processing: bool=True, yield_results: bool = False) -> Union[AsyncGenerator[CrawlResult, None], ScraperResult]: + """Scrape the given URL using the specified crawler. + + Args: + url (str): The starting URL for the scrape. + crawler (AsyncWebCrawler): The web crawler instance. + parallel_processing (bool): Whether to use parallel processing. Defaults to True. 
+ yield_results (bool): If True, yields individual crawl results as they are ready; + if False, accumulates results and returns a final ScraperResult. + + Yields: + CrawlResult: Individual crawl results if yield_results is True. + + Returns: + ScraperResult: A summary of the scrape results containing the final extracted data + and the list of crawled URLs if yield_results is False. + """ pass \ No newline at end of file From 04d8b47b927a5b3ba73e156e99292b76631c9c34 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Wed, 16 Oct 2024 22:34:54 +0530 Subject: [PATCH 07/28] Exposed min_crawl_delay for BFSScraperStrategy --- crawl4ai/scraper/bfs_scraper_strategy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crawl4ai/scraper/bfs_scraper_strategy.py b/crawl4ai/scraper/bfs_scraper_strategy.py index 1146714d..6fc39e73 100644 --- a/crawl4ai/scraper/bfs_scraper_strategy.py +++ b/crawl4ai/scraper/bfs_scraper_strategy.py @@ -18,14 +18,14 @@ logging.basicConfig(level=logging.DEBUG) rate_limiter = AsyncLimiter(1, 1) # 1 request per second class BFSScraperStrategy(ScraperStrategy): - def __init__(self, max_depth: int, filter_chain: FilterChain, url_scorer: URLScorer, max_concurrent: int = 5): + def __init__(self, max_depth: int, filter_chain: FilterChain, url_scorer: URLScorer, max_concurrent: int = 5, min_crawl_delay: int=1): self.max_depth = max_depth self.filter_chain = filter_chain self.url_scorer = url_scorer self.max_concurrent = max_concurrent # For Crawl Politeness self.last_crawl_time = defaultdict(float) - self.min_crawl_delay = 1 # 1 second delay between requests to the same domain + self.min_crawl_delay = min_crawl_delay # 1 second delay between requests to the same domain # For Robots.txt Compliance self.robot_parsers = {} From de28b59aca473b3292ecc7f6ab1f60dbd3ed488a Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Wed, 16 Oct 2024 22:36:48 +0530 Subject: [PATCH 08/28] removed unused imports --- crawl4ai/scraper/async_web_scraper.py | 2 
-- 1 file changed, 2 deletions(-) diff --git a/crawl4ai/scraper/async_web_scraper.py b/crawl4ai/scraper/async_web_scraper.py index 811aeacc..0d921af5 100644 --- a/crawl4ai/scraper/async_web_scraper.py +++ b/crawl4ai/scraper/async_web_scraper.py @@ -1,5 +1,3 @@ -import asyncio -from typing import List, Dict from .scraper_strategy import ScraperStrategy from .models import ScraperResult, CrawlResult from ..async_webcrawler import AsyncWebCrawler From ce7fce4b1648761b90ad95cc699b2e13abe19be2 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Thu, 17 Oct 2024 12:25:17 +0530 Subject: [PATCH 09/28] 1. Moved to asyncio.wait instead of gather so that results can be yielded just as they are ready, rather than in batches 2. Moved the visited.add(url), to before the task is put in queue rather than after the crawl is completed. This makes sure that duplicate crawls don't happen when same URL is found at different depth and that gets queued too because the crawl is not yet completed and visited set is not updated. 3. Named the yield_results attribute to stream instead. Since that seems to be popularly used in all other AI libraries for intermediate results.
--- crawl4ai/scraper/async_web_scraper.py | 6 +++--- crawl4ai/scraper/bfs_scraper_strategy.py | 26 ++++++++++++++---------- crawl4ai/scraper/scraper_strategy.py | 8 ++++---- 3 files changed, 22 insertions(+), 18 deletions(-) diff --git a/crawl4ai/scraper/async_web_scraper.py b/crawl4ai/scraper/async_web_scraper.py index 0d921af5..2fd919e1 100644 --- a/crawl4ai/scraper/async_web_scraper.py +++ b/crawl4ai/scraper/async_web_scraper.py @@ -4,12 +4,12 @@ from ..async_webcrawler import AsyncWebCrawler from typing import Union, AsyncGenerator class AsyncWebScraper: - def __init__(self, crawler: AsyncWebCrawler, strategy: ScraperStrategy, batch_size: int = 10, concurrency_limit: int = 5): + def __init__(self, crawler: AsyncWebCrawler, strategy: ScraperStrategy): self.crawler = crawler self.strategy = strategy - async def ascrape(self, url: str, parallel_processing: bool = True, yield_results: bool = False) -> Union[AsyncGenerator[CrawlResult, None], ScraperResult]: - if yield_results: + async def ascrape(self, url: str, parallel_processing: bool = True, stream: bool = False) -> Union[AsyncGenerator[CrawlResult, None], ScraperResult]: + if stream: return self._ascrape_yielding(url, parallel_processing) else: return await self._ascrape_collecting(url, parallel_processing) diff --git a/crawl4ai/scraper/bfs_scraper_strategy.py b/crawl4ai/scraper/bfs_scraper_strategy.py index 6fc39e73..b6cdaa80 100644 --- a/crawl4ai/scraper/bfs_scraper_strategy.py +++ b/crawl4ai/scraper/bfs_scraper_strategy.py @@ -71,7 +71,7 @@ class BFSScraperStrategy(ScraperStrategy): if not robot_parser.can_fetch(crawler.crawler_strategy.user_agent, url): logging.info(f"Skipping {url} as per robots.txt") return None - + # Crawl Politeness domain = urlparse(url).netloc time_since_last_crawl = time.time() - self.last_crawl_time[domain] @@ -97,8 +97,6 @@ class BFSScraperStrategy(ScraperStrategy): elif crawl_result.status_code == 503: await self.add_to_retry_queue(url) return crawl_result - - visited.add(url) # 
Process links for link_type in ["internal", "external"]: @@ -114,27 +112,33 @@ class BFSScraperStrategy(ScraperStrategy): depths[normalized_link] = new_depth return crawl_result - async def ascrape(self, start_url: str, crawler: AsyncWebCrawler, parallel_processing:bool = True) -> CrawlResult: + async def ascrape(self, start_url: str, crawler: AsyncWebCrawler, parallel_processing:bool = True) -> AsyncGenerator[CrawlResult,None]: queue = asyncio.PriorityQueue() queue.put_nowait((0, 0, start_url)) visited = set() depths = {start_url: 0} + pending_tasks = set() - while not queue.empty(): - tasks = [] - while not queue.empty() and len(tasks) < self.max_concurrent: + while not queue.empty() or pending_tasks: + while not queue.empty() and len(pending_tasks) < self.max_concurrent: _, depth, url = await queue.get() if url not in visited: + # Adding URL to the visited set here itself, (instead of after result generation) + # so that other tasks are not queued for same URL, found at different depth before + # crawling and extraction of this task is completed. 
+ visited.add(url) if parallel_processing: task = asyncio.create_task(self.process_url(url, depth, crawler, queue, visited, depths)) - tasks.append(task) + pending_tasks.add(task) else: result = await self.process_url(url, depth, crawler, queue, visited, depths) if result: yield result - if parallel_processing and tasks: - results = await asyncio.gather(*tasks) - for result in results: + # Wait for the first task to complete and yield results incrementally as each task is completed + if pending_tasks: + done, pending_tasks = await asyncio.wait(pending_tasks, return_when=asyncio.FIRST_COMPLETED) + for task in done: + result = await task if result: yield result \ No newline at end of file diff --git a/crawl4ai/scraper/scraper_strategy.py b/crawl4ai/scraper/scraper_strategy.py index e08a980d..e4872de7 100644 --- a/crawl4ai/scraper/scraper_strategy.py +++ b/crawl4ai/scraper/scraper_strategy.py @@ -6,21 +6,21 @@ from typing import Union, AsyncGenerator class ScraperStrategy(ABC): @abstractmethod - async def ascrape(self, url: str, crawler: AsyncWebCrawler, parallel_processing: bool=True, yield_results: bool = False) -> Union[AsyncGenerator[CrawlResult, None], ScraperResult]: + async def ascrape(self, url: str, crawler: AsyncWebCrawler, parallel_processing: bool = True, stream: bool = False) -> Union[AsyncGenerator[CrawlResult, None], ScraperResult]: """Scrape the given URL using the specified crawler. Args: url (str): The starting URL for the scrape. crawler (AsyncWebCrawler): The web crawler instance. parallel_processing (bool): Whether to use parallel processing. Defaults to True. - yield_results (bool): If True, yields individual crawl results as they are ready; + stream (bool): If True, yields individual crawl results as they are ready; if False, accumulates results and returns a final ScraperResult. Yields: - CrawlResult: Individual crawl results if yield_results is True. + CrawlResult: Individual crawl results if stream is True. 
Returns: ScraperResult: A summary of the scrape results containing the final extracted data - and the list of crawled URLs if yield_results is False. + and the list of crawled URLs if stream is False. """ pass \ No newline at end of file From 8105fd178e1b7b00a4628e2227953fbe418af5c4 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Thu, 17 Oct 2024 15:42:43 +0530 Subject: [PATCH 10/28] Removed stubs for remove_from_future_crawls since the visited set is updated soon as the URL was queued, Removed add_to_retry_queue(url) since retry with exponential backoff with help of tenacity is going to take care of it. --- crawl4ai/scraper/bfs_scraper_strategy.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/crawl4ai/scraper/bfs_scraper_strategy.py b/crawl4ai/scraper/bfs_scraper_strategy.py index b6cdaa80..ce4d0127 100644 --- a/crawl4ai/scraper/bfs_scraper_strategy.py +++ b/crawl4ai/scraper/bfs_scraper_strategy.py @@ -91,11 +91,6 @@ class BFSScraperStrategy(ScraperStrategy): if not crawl_result.success: # Logging and Monitoring logging.error(f"Failed to crawl URL: {url}. 
Error: {crawl_result.error_message}") - # Error Categorization - if crawl_result.status_code == 404: - self.remove_from_future_crawls(url) - elif crawl_result.status_code == 503: - await self.add_to_retry_queue(url) return crawl_result # Process links From 06b21dcc501bfdfca3450dd804bdee202400482b Mon Sep 17 00:00:00 2001 From: UncleCode Date: Wed, 6 Nov 2024 18:44:03 +0800 Subject: [PATCH 11/28] Update .gitignore to include new directories for issues and documentation --- .gitignore | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 8b8f014c..4c3e151e 100644 --- a/.gitignore +++ b/.gitignore @@ -202,5 +202,10 @@ todo.md git_changes.py git_changes.md pypi_build.sh +git_issues.py +git_issues.md -.tests/ \ No newline at end of file +.tests/ +.issues/ +.docs/ +.issues/ \ No newline at end of file From be472c624c625b5f240705112036fd5ef6f1eb8f Mon Sep 17 00:00:00 2001 From: UncleCode Date: Wed, 6 Nov 2024 21:09:47 +0800 Subject: [PATCH 12/28] Refactored AsyncWebScraper to include comprehensive error handling and progress tracking capabilities. Introduced a ScrapingProgress data class to monitor processed and failed URLs. Enhanced scraping methods to log errors and track stats throughout the scraping process. 
--- crawl4ai/scraper/async_web_scraper.py | 132 ++++++++++++++++---- docs/scrapper/async_web_scraper.md | 166 ++++++++++++++++++++++++++ 2 files changed, 277 insertions(+), 21 deletions(-) create mode 100644 docs/scrapper/async_web_scraper.md diff --git a/crawl4ai/scraper/async_web_scraper.py b/crawl4ai/scraper/async_web_scraper.py index 2fd919e1..45a35306 100644 --- a/crawl4ai/scraper/async_web_scraper.py +++ b/crawl4ai/scraper/async_web_scraper.py @@ -1,33 +1,123 @@ +from typing import Union, AsyncGenerator, Optional from .scraper_strategy import ScraperStrategy from .models import ScraperResult, CrawlResult from ..async_webcrawler import AsyncWebCrawler -from typing import Union, AsyncGenerator +import logging +from dataclasses import dataclass +from contextlib import asynccontextmanager + +@dataclass +class ScrapingProgress: + """Tracks the progress of a scraping operation.""" + processed_urls: int = 0 + failed_urls: int = 0 + current_url: Optional[str] = None class AsyncWebScraper: - def __init__(self, crawler: AsyncWebCrawler, strategy: ScraperStrategy): + """ + A high-level web scraper that combines an async crawler with a scraping strategy. 
+ + Args: + crawler (AsyncWebCrawler): The async web crawler implementation + strategy (ScraperStrategy): The scraping strategy to use + logger (Optional[logging.Logger]): Custom logger for the scraper + """ + + def __init__( + self, + crawler: AsyncWebCrawler, + strategy: ScraperStrategy, + logger: Optional[logging.Logger] = None + ): + if not isinstance(crawler, AsyncWebCrawler): + raise TypeError("crawler must be an instance of AsyncWebCrawler") + if not isinstance(strategy, ScraperStrategy): + raise TypeError("strategy must be an instance of ScraperStrategy") + self.crawler = crawler self.strategy = strategy + self.logger = logger or logging.getLogger(__name__) + self._progress = ScrapingProgress() - async def ascrape(self, url: str, parallel_processing: bool = True, stream: bool = False) -> Union[AsyncGenerator[CrawlResult, None], ScraperResult]: - if stream: - return self._ascrape_yielding(url, parallel_processing) - else: + @property + def progress(self) -> ScrapingProgress: + """Get current scraping progress.""" + return self._progress + + @asynccontextmanager + async def _error_handling_context(self, url: str): + """Context manager for handling errors during scraping.""" + try: + yield + except Exception as e: + self.logger.error(f"Error scraping {url}: {str(e)}") + self._progress.failed_urls += 1 + raise + + async def ascrape( + self, + url: str, + parallel_processing: bool = True, + stream: bool = False + ) -> Union[AsyncGenerator[CrawlResult, None], ScraperResult]: + """ + Scrape a website starting from the given URL. 
+ + Args: + url: Starting URL for scraping + parallel_processing: Whether to process URLs in parallel + stream: If True, yield results as they come; if False, collect all results + + Returns: + Either an async generator yielding CrawlResults or a final ScraperResult + """ + self._progress = ScrapingProgress() # Reset progress + + async with self._error_handling_context(url): + if stream: + return self._ascrape_yielding(url, parallel_processing) return await self._ascrape_collecting(url, parallel_processing) - async def _ascrape_yielding(self, url: str, parallel_processing: bool) -> AsyncGenerator[CrawlResult, None]: - result_generator = self.strategy.ascrape(url, self.crawler, parallel_processing) - async for res in result_generator: # Consume the async generator - yield res # Yielding individual results + async def _ascrape_yielding( + self, + url: str, + parallel_processing: bool + ) -> AsyncGenerator[CrawlResult, None]: + """Stream scraping results as they become available.""" + try: + result_generator = self.strategy.ascrape(url, self.crawler, parallel_processing) + async for res in result_generator: + self._progress.processed_urls += 1 + self._progress.current_url = res.url + yield res + except Exception as e: + self.logger.error(f"Error in streaming scrape: {str(e)}") + raise - async def _ascrape_collecting(self, url: str, parallel_processing: bool) -> ScraperResult: + async def _ascrape_collecting( + self, + url: str, + parallel_processing: bool + ) -> ScraperResult: + """Collect all scraping results before returning.""" extracted_data = {} - result_generator = self.strategy.ascrape(url, self.crawler, parallel_processing) - async for res in result_generator: # Consume the async generator - extracted_data[res.url] = res - - # Return a final ScraperResult - return ScraperResult( - url=url, - crawled_urls=list(extracted_data.keys()), - extracted_data=extracted_data - ) \ No newline at end of file + + try: + result_generator = self.strategy.ascrape(url, 
self.crawler, parallel_processing) + async for res in result_generator: + self._progress.processed_urls += 1 + self._progress.current_url = res.url + extracted_data[res.url] = res + + return ScraperResult( + url=url, + crawled_urls=list(extracted_data.keys()), + extracted_data=extracted_data, + stats={ + 'processed_urls': self._progress.processed_urls, + 'failed_urls': self._progress.failed_urls + } + ) + except Exception as e: + self.logger.error(f"Error in collecting scrape: {str(e)}") + raise \ No newline at end of file diff --git a/docs/scrapper/async_web_scraper.md b/docs/scrapper/async_web_scraper.md new file mode 100644 index 00000000..ca5f749f --- /dev/null +++ b/docs/scrapper/async_web_scraper.md @@ -0,0 +1,166 @@ +# AsyncWebScraper: Smart Web Crawling Made Easy + +AsyncWebScraper is a powerful and flexible web scraping tool that makes it easy to collect data from websites efficiently. Whether you need to scrape a few pages or an entire website, AsyncWebScraper handles the complexity of web crawling while giving you fine-grained control over the process. 
+ +## How It Works + +```mermaid +flowchart TB + Start([Start]) --> Init[Initialize AsyncWebScraper\nwith Crawler and Strategy] + Init --> InputURL[Receive URL to scrape] + InputURL --> Decision{Stream or\nCollect?} + + %% Streaming Path + Decision -->|Stream| StreamInit[Initialize Streaming Mode] + StreamInit --> StreamStrategy[Call Strategy.ascrape] + StreamStrategy --> AsyncGen[Create Async Generator] + AsyncGen --> ProcessURL[Process Next URL] + ProcessURL --> FetchContent[Fetch Page Content] + FetchContent --> Extract[Extract Data] + Extract --> YieldResult[Yield CrawlResult] + YieldResult --> CheckMore{More URLs?} + CheckMore -->|Yes| ProcessURL + CheckMore -->|No| StreamEnd([End Stream]) + + %% Collecting Path + Decision -->|Collect| CollectInit[Initialize Collection Mode] + CollectInit --> CollectStrategy[Call Strategy.ascrape] + CollectStrategy --> CollectGen[Create Async Generator] + CollectGen --> ProcessURLColl[Process Next URL] + ProcessURLColl --> FetchContentColl[Fetch Page Content] + FetchContentColl --> ExtractColl[Extract Data] + ExtractColl --> StoreColl[Store in Dictionary] + StoreColl --> CheckMoreColl{More URLs?} + CheckMoreColl -->|Yes| ProcessURLColl + CheckMoreColl -->|No| CreateResult[Create ScraperResult] + CreateResult --> ReturnResult([Return Result]) + + %% Parallel Processing + subgraph Parallel + ProcessURL + FetchContent + Extract + ProcessURLColl + FetchContentColl + ExtractColl + end + + %% Error Handling + FetchContent --> ErrorCheck{Error?} + ErrorCheck -->|Yes| LogError[Log Error] + LogError --> UpdateStats[Update Error Stats] + UpdateStats --> CheckMore + ErrorCheck -->|No| Extract + + FetchContentColl --> ErrorCheckColl{Error?} + ErrorCheckColl -->|Yes| LogErrorColl[Log Error] + LogErrorColl --> UpdateStatsColl[Update Error Stats] + UpdateStatsColl --> CheckMoreColl + ErrorCheckColl -->|No| ExtractColl + + %% Style definitions + classDef process fill:#90caf9,stroke:#000,stroke-width:2px; + classDef decision 
fill:#fff59d,stroke:#000,stroke-width:2px; + classDef error fill:#ef9a9a,stroke:#000,stroke-width:2px; + classDef start fill:#a5d6a7,stroke:#000,stroke-width:2px; + + class Start,StreamEnd,ReturnResult start; + class Decision,CheckMore,CheckMoreColl,ErrorCheck,ErrorCheckColl decision; + class LogError,LogErrorColl,UpdateStats,UpdateStatsColl error; + class ProcessURL,FetchContent,Extract,ProcessURLColl,FetchContentColl,ExtractColl process; +``` + +AsyncWebScraper uses an intelligent crawling system that can navigate through websites following your specified strategy. It supports two main modes of operation: + +### 1. Streaming Mode +```python +async for result in scraper.ascrape(url, stream=True): + print(f"Found data on {result.url}") + process_data(result.data) +``` +- Perfect for processing large websites +- Memory efficient - handles one page at a time +- Ideal for real-time data processing +- Great for monitoring or continuous scraping tasks + +### 2. Collection Mode +```python +result = await scraper.ascrape(url) +print(f"Scraped {len(result.crawled_urls)} pages") +process_all_data(result.extracted_data) +``` +- Collects all data before returning +- Best for when you need the complete dataset +- Easier to work with for batch processing +- Includes comprehensive statistics + +## Key Features + +- **Smart Crawling**: Automatically follows relevant links while avoiding duplicates +- **Parallel Processing**: Scrapes multiple pages simultaneously for better performance +- **Memory Efficient**: Choose between streaming and collecting based on your needs +- **Error Resilient**: Continues working even if some pages fail to load +- **Progress Tracking**: Monitor the scraping progress in real-time +- **Customizable**: Configure crawling strategy, filters, and scoring to match your needs + +## Quick Start + +```python +from crawl4ai.scraper import AsyncWebScraper, BFSStrategy +from crawl4ai.async_webcrawler import AsyncWebCrawler + +# Initialize the scraper +crawler = 
AsyncWebCrawler() +strategy = BFSStrategy( + max_depth=2, # How deep to crawl + url_pattern="*.example.com/*" # What URLs to follow +) +scraper = AsyncWebScraper(crawler, strategy) + +# Start scraping +async def main(): + # Collect all results + result = await scraper.ascrape("https://example.com") + print(f"Found {len(result.extracted_data)} pages") + + # Or stream results + async for page in scraper.ascrape("https://example.com", stream=True): + print(f"Processing {page.url}") + +``` + +## Best Practices + +1. **Choose the Right Mode** + - Use streaming for large websites or real-time processing + - Use collecting for smaller sites or when you need the complete dataset + +2. **Configure Depth** + - Start with a small depth (2-3) and increase if needed + - Higher depths mean exponentially more pages to crawl + +3. **Set Appropriate Filters** + - Use URL patterns to stay within relevant sections + - Set content type filters to only process useful pages + +4. **Handle Resources Responsibly** + - Enable parallel processing for faster results + - Consider the target website's capacity + - Implement appropriate delays between requests + +## Common Use Cases + +- **Content Aggregation**: Collect articles, blog posts, or news from multiple pages +- **Data Extraction**: Gather product information, prices, or specifications +- **Site Mapping**: Create a complete map of a website's structure +- **Content Monitoring**: Track changes or updates across multiple pages +- **Data Mining**: Extract and analyze patterns across web pages + +## Advanced Features + +- Custom scoring algorithms for prioritizing important pages +- URL filters for focusing on specific site sections +- Content type filtering for processing only relevant pages +- Progress tracking for monitoring long-running scrapes + +Need more help? Check out our [examples repository](https://github.com/example/crawl4ai/examples) or join our [community Discord](https://discord.gg/example). 
\ No newline at end of file From 3d1c9a84349ad4b7507ba257760071d4177c205a Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 7 Nov 2024 18:54:53 +0800 Subject: [PATCH 13/28] Revieweing the BFS strategy. --- crawl4ai/scraper/bfs_scraper_strategy copy.py | 138 +++++++ crawl4ai/scraper/bfs_scraper_strategy.py | 341 ++++++++++++------ 2 files changed, 368 insertions(+), 111 deletions(-) create mode 100644 crawl4ai/scraper/bfs_scraper_strategy copy.py diff --git a/crawl4ai/scraper/bfs_scraper_strategy copy.py b/crawl4ai/scraper/bfs_scraper_strategy copy.py new file mode 100644 index 00000000..51bf9cb3 --- /dev/null +++ b/crawl4ai/scraper/bfs_scraper_strategy copy.py @@ -0,0 +1,138 @@ +from .scraper_strategy import ScraperStrategy +from .filters import FilterChain +from .scorers import URLScorer +from ..models import CrawlResult +from ..async_webcrawler import AsyncWebCrawler +import asyncio +import validators +from urllib.parse import urljoin,urlparse,urlunparse +from urllib.robotparser import RobotFileParser +import time +from aiolimiter import AsyncLimiter +from tenacity import retry, stop_after_attempt, wait_exponential +from collections import defaultdict +import logging +from typing import Dict, AsyncGenerator +logging.basicConfig(level=logging.DEBUG) + +rate_limiter = AsyncLimiter(1, 1) # 1 request per second + +class BFSScraperStrategy(ScraperStrategy): + def __init__(self, max_depth: int, filter_chain: FilterChain, url_scorer: URLScorer, max_concurrent: int = 5, min_crawl_delay: int=1): + self.max_depth = max_depth + self.filter_chain = filter_chain + self.url_scorer = url_scorer + self.max_concurrent = max_concurrent + # For Crawl Politeness + self.last_crawl_time = defaultdict(float) + self.min_crawl_delay = min_crawl_delay # 1 second delay between requests to the same domain + # For Robots.txt Compliance + self.robot_parsers = {} + + # Robots.txt Parser + def get_robot_parser(self, url: str) -> RobotFileParser: + domain = urlparse(url) + scheme = 
domain.scheme if domain.scheme else 'http' # Default to 'http' if no scheme provided + netloc = domain.netloc + if netloc not in self.robot_parsers: + rp = RobotFileParser() + rp.set_url(f"{scheme}://{netloc}/robots.txt") + try: + rp.read() + except Exception as e: + # Log the type of error, message, and the URL + logging.warning(f"Error {type(e).__name__} occurred while fetching robots.txt for {netloc}: {e}") + return None + self.robot_parsers[netloc] = rp + return self.robot_parsers[netloc] + + # Retry with exponential backoff + @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10)) + async def retry_crawl(self, crawler: AsyncWebCrawler, url: str) -> CrawlResult: + return await crawler.arun(url) + + async def process_url(self, url: str, depth: int, crawler: AsyncWebCrawler, queue: asyncio.PriorityQueue, visited: set, depths: Dict[str, int]) -> AsyncGenerator[CrawlResult, None]: + def normalize_url(url: str) -> str: + parsed = urlparse(url) + return urlunparse(parsed._replace(fragment="")) + + # URL Validation + if not validators.url(url): + logging.warning(f"Invalid URL: {url}") + return None + + # Robots.txt Compliance + robot_parser = self.get_robot_parser(url) + if robot_parser is None: + logging.info(f"Could not retrieve robots.txt for {url}, hence proceeding with crawl.") + else: + # If robots.txt was fetched, check if crawling is allowed + if not robot_parser.can_fetch(crawler.crawler_strategy.user_agent, url): + logging.info(f"Skipping {url} as per robots.txt") + return None + + # Crawl Politeness + domain = urlparse(url).netloc + time_since_last_crawl = time.time() - self.last_crawl_time[domain] + if time_since_last_crawl < self.min_crawl_delay: + await asyncio.sleep(self.min_crawl_delay - time_since_last_crawl) + self.last_crawl_time[domain] = time.time() + + # Rate Limiting + async with rate_limiter: + # Error Handling + try: + crawl_result = await self.retry_crawl(crawler, url) + except Exception as e: + 
logging.error(f"Error crawling {url}: {str(e)}") + crawl_result = CrawlResult(url=url, html="", success=False, status_code=0, error_message=str(e)) + + if not crawl_result.success: + # Logging and Monitoring + logging.error(f"Failed to crawl URL: {url}. Error: {crawl_result.error_message}") + return crawl_result + + # Process links + for link_type in ["internal", "external"]: + for link in crawl_result.links[link_type]: + absolute_link = urljoin(url, link['href']) + normalized_link = normalize_url(absolute_link) + if self.filter_chain.apply(normalized_link) and normalized_link not in visited: + new_depth = depths[url] + 1 + if new_depth <= self.max_depth: + # URL Scoring + score = self.url_scorer.score(normalized_link) + await queue.put((score, new_depth, normalized_link)) + depths[normalized_link] = new_depth + return crawl_result + + async def ascrape(self, start_url: str, crawler: AsyncWebCrawler, parallel_processing:bool = True) -> AsyncGenerator[CrawlResult,None]: + queue = asyncio.PriorityQueue() + queue.put_nowait((0, 0, start_url)) + visited = set() + depths = {start_url: 0} + pending_tasks = set() + + while not queue.empty() or pending_tasks: + while not queue.empty() and len(pending_tasks) < self.max_concurrent: + _, depth, url = await queue.get() + if url not in visited: + # Adding URL to the visited set here itself, (instead of after result generation) + # so that other tasks are not queued for same URL, found at different depth before + # crawling and extraction of this task is completed. 
+ visited.add(url) + if parallel_processing: + task = asyncio.create_task(self.process_url(url, depth, crawler, queue, visited, depths)) + pending_tasks.add(task) + else: + result = await self.process_url(url, depth, crawler, queue, visited, depths) + if result: + yield result + + # Wait for the first task to complete and yield results incrementally as each task is completed + if pending_tasks: + done, pending_tasks = await asyncio.wait(pending_tasks, return_when=asyncio.FIRST_COMPLETED) + for task in done: + result = await task + if result: + yield result \ No newline at end of file diff --git a/crawl4ai/scraper/bfs_scraper_strategy.py b/crawl4ai/scraper/bfs_scraper_strategy.py index ce4d0127..4506dbfe 100644 --- a/crawl4ai/scraper/bfs_scraper_strategy.py +++ b/crawl4ai/scraper/bfs_scraper_strategy.py @@ -1,139 +1,258 @@ -from .scraper_strategy import ScraperStrategy -from .filters import FilterChain -from .scorers import URLScorer -from ..models import CrawlResult -from ..async_webcrawler import AsyncWebCrawler +from abc import ABC, abstractmethod +from typing import Union, AsyncGenerator, Optional, Dict, Set +from dataclasses import dataclass +from datetime import datetime import asyncio -import validators -from urllib.parse import urljoin,urlparse,urlunparse +import logging +from urllib.parse import urljoin, urlparse, urlunparse from urllib.robotparser import RobotFileParser +import validators import time from aiolimiter import AsyncLimiter from tenacity import retry, stop_after_attempt, wait_exponential from collections import defaultdict -import logging -from typing import Dict, AsyncGenerator -logging.basicConfig(level=logging.DEBUG) -rate_limiter = AsyncLimiter(1, 1) # 1 request per second +from .models import ScraperResult, CrawlResult +from .filters import FilterChain +from .scorers import URLScorer +from ..async_webcrawler import AsyncWebCrawler + +@dataclass +class CrawlStats: + """Statistics for the crawling process""" + start_time: datetime + 
urls_processed: int = 0 + urls_failed: int = 0 + urls_skipped: int = 0 + total_depth_reached: int = 0 + current_depth: int = 0 + robots_blocked: int = 0 + +class ScraperStrategy(ABC): + """Base class for scraping strategies""" + + @abstractmethod + async def ascrape( + self, + url: str, + crawler: AsyncWebCrawler, + parallel_processing: bool = True, + stream: bool = False + ) -> Union[AsyncGenerator[CrawlResult, None], ScraperResult]: + """Abstract method for scraping implementation""" + pass + + @abstractmethod + async def can_process_url(self, url: str) -> bool: + """Check if URL can be processed based on strategy rules""" + pass + + @abstractmethod + async def shutdown(self): + """Clean up resources used by the strategy""" + pass class BFSScraperStrategy(ScraperStrategy): - def __init__(self, max_depth: int, filter_chain: FilterChain, url_scorer: URLScorer, max_concurrent: int = 5, min_crawl_delay: int=1): + """Breadth-First Search scraping strategy with politeness controls""" + + def __init__( + self, + max_depth: int, + filter_chain: FilterChain, + url_scorer: URLScorer, + max_concurrent: int = 5, + min_crawl_delay: int = 1, + timeout: int = 30, + logger: Optional[logging.Logger] = None + ): self.max_depth = max_depth self.filter_chain = filter_chain self.url_scorer = url_scorer self.max_concurrent = max_concurrent - # For Crawl Politeness + self.min_crawl_delay = min_crawl_delay + self.timeout = timeout + self.logger = logger or logging.getLogger(__name__) + + # Crawl control + self.stats = CrawlStats(start_time=datetime.now()) + self._cancel_event = asyncio.Event() + + # Rate limiting and politeness + self.rate_limiter = AsyncLimiter(1, 1) self.last_crawl_time = defaultdict(float) - self.min_crawl_delay = min_crawl_delay # 1 second delay between requests to the same domain - # For Robots.txt Compliance - self.robot_parsers = {} + self.robot_parsers: Dict[str, RobotFileParser] = {} + self.domain_queues: Dict[str, asyncio.Queue] = defaultdict(asyncio.Queue) - 
# Robots.txt Parser - def get_robot_parser(self, url: str) -> RobotFileParser: - domain = urlparse(url) - scheme = domain.scheme if domain.scheme else 'http' # Default to 'http' if no scheme provided - netloc = domain.netloc - if netloc not in self.robot_parsers: - rp = RobotFileParser() - rp.set_url(f"{scheme}://{netloc}/robots.txt") - try: - rp.read() - except Exception as e: - # Log the type of error, message, and the URL - logging.warning(f"Error {type(e).__name__} occurred while fetching robots.txt for {netloc}: {e}") - return None - self.robot_parsers[netloc] = rp - return self.robot_parsers[netloc] - - - # Retry with exponential backoff - @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10)) - async def retry_crawl(self, crawler: AsyncWebCrawler, url: str) -> CrawlResult: - return await crawler.arun(url) - - async def process_url(self, url: str, depth: int, crawler: AsyncWebCrawler, queue: asyncio.PriorityQueue, visited: set, depths: Dict[str, int]) -> AsyncGenerator[CrawlResult, None]: - def normalize_url(url: str) -> str: - parsed = urlparse(url) - return urlunparse(parsed._replace(fragment="")) - - # URL Validation + async def can_process_url(self, url: str) -> bool: + """Check if URL can be processed based on robots.txt and filters""" if not validators.url(url): - logging.warning(f"Invalid URL: {url}") - return None - - # Robots.txt Compliance - robot_parser = self.get_robot_parser(url) - if robot_parser is None: - logging.info(f"Could not retrieve robots.txt for {url}, hence proceeding with crawl.") - else: - # If robots.txt was fetched, check if crawling is allowed - if not robot_parser.can_fetch(crawler.crawler_strategy.user_agent, url): - logging.info(f"Skipping {url} as per robots.txt") - return None - - # Crawl Politeness + self.logger.warning(f"Invalid URL: {url}") + return False + + robot_parser = await self._get_robot_parser(url) + if robot_parser and not robot_parser.can_fetch("*", url): + 
self.stats.robots_blocked += 1 + self.logger.info(f"Blocked by robots.txt: {url}") + return False + + return self.filter_chain.apply(url) + + async def _get_robot_parser(self, url: str) -> Optional[RobotFileParser]: + """Get or create robots.txt parser for domain""" domain = urlparse(url).netloc - time_since_last_crawl = time.time() - self.last_crawl_time[domain] - if time_since_last_crawl < self.min_crawl_delay: - await asyncio.sleep(self.min_crawl_delay - time_since_last_crawl) + if domain not in self.robot_parsers: + parser = RobotFileParser() + try: + robots_url = f"{urlparse(url).scheme}://{domain}/robots.txt" + parser.set_url(robots_url) + parser.read() + self.robot_parsers[domain] = parser + except Exception as e: + self.logger.warning(f"Error fetching robots.txt for {domain}: {e}") + return None + return self.robot_parsers[domain] + + @retry(stop=stop_after_attempt(3), + wait=wait_exponential(multiplier=1, min=4, max=10)) + async def _crawl_with_retry( + self, + crawler: AsyncWebCrawler, + url: str + ) -> CrawlResult: + """Crawl URL with retry logic""" + try: + async with asyncio.timeout(self.timeout): + return await crawler.arun(url) + except asyncio.TimeoutError: + self.logger.error(f"Timeout crawling {url}") + raise + + async def process_url( + self, + url: str, + depth: int, + crawler: AsyncWebCrawler, + queue: asyncio.PriorityQueue, + visited: Set[str], + depths: Dict[str, int] + ) -> Optional[CrawlResult]: + """Process a single URL and extract links""" + + if self._cancel_event.is_set(): + return None + + if not await self.can_process_url(url): + self.stats.urls_skipped += 1 + return None + + # Politeness delay + domain = urlparse(url).netloc + time_since_last = time.time() - self.last_crawl_time[domain] + if time_since_last < self.min_crawl_delay: + await asyncio.sleep(self.min_crawl_delay - time_since_last) self.last_crawl_time[domain] = time.time() - # Rate Limiting - async with rate_limiter: - # Error Handling - try: - crawl_result = await 
self.retry_crawl(crawler, url) - except Exception as e: - logging.error(f"Error crawling {url}: {str(e)}") - crawl_result = CrawlResult(url=url, html="", success=False, status_code=0, error_message=str(e)) - - if not crawl_result.success: - # Logging and Monitoring - logging.error(f"Failed to crawl URL: {url}. Error: {crawl_result.error_message}") - return crawl_result + # Crawl with rate limiting + try: + async with self.rate_limiter: + result = await self._crawl_with_retry(crawler, url) + self.stats.urls_processed += 1 + except Exception as e: + self.logger.error(f"Error crawling {url}: {e}") + self.stats.urls_failed += 1 + return None # Process links - for link_type in ["internal", "external"]: - for link in crawl_result.links[link_type]: - absolute_link = urljoin(url, link['href']) - normalized_link = normalize_url(absolute_link) - if self.filter_chain.apply(normalized_link) and normalized_link not in visited: - new_depth = depths[url] + 1 - if new_depth <= self.max_depth: - # URL Scoring - score = self.url_scorer.score(normalized_link) - await queue.put((score, new_depth, normalized_link)) - depths[normalized_link] = new_depth - return crawl_result + await self._process_links(result, url, depth, queue, visited, depths) + + return result - async def ascrape(self, start_url: str, crawler: AsyncWebCrawler, parallel_processing:bool = True) -> AsyncGenerator[CrawlResult,None]: + async def _process_links( + self, + result: CrawlResult, + source_url: str, + depth: int, + queue: asyncio.PriorityQueue, + visited: Set[str], + depths: Dict[str, int] + ): + """Process extracted links from crawl result""" + for link_type in ["internal", "external"]: + for link in result.links[link_type]: + url = urljoin(source_url, link['href']) + url = urlunparse(urlparse(url)._replace(fragment="")) + + if url not in visited and await self.can_process_url(url): + new_depth = depths[source_url] + 1 + if new_depth <= self.max_depth: + score = self.url_scorer.score(url) + await 
queue.put((score, new_depth, url)) + depths[url] = new_depth + self.stats.total_depth_reached = max( + self.stats.total_depth_reached, + new_depth + ) + + async def ascrape( + self, + start_url: str, + crawler: AsyncWebCrawler, + parallel_processing: bool = True + ) -> AsyncGenerator[CrawlResult, None]: + """Implement BFS crawling strategy""" + + # Initialize crawl state queue = asyncio.PriorityQueue() - queue.put_nowait((0, 0, start_url)) - visited = set() + await queue.put((0, 0, start_url)) + visited: Set[str] = set() depths = {start_url: 0} pending_tasks = set() + + try: + while (not queue.empty() or pending_tasks) and not self._cancel_event.is_set(): + # Start new tasks up to max_concurrent + while not queue.empty() and len(pending_tasks) < self.max_concurrent: + _, depth, url = await queue.get() + if url not in visited: + visited.add(url) + self.stats.current_depth = depth + + if parallel_processing: + task = asyncio.create_task( + self.process_url(url, depth, crawler, queue, visited, depths) + ) + pending_tasks.add(task) + else: + result = await self.process_url( + url, depth, crawler, queue, visited, depths + ) + if result: + yield result - while not queue.empty() or pending_tasks: - while not queue.empty() and len(pending_tasks) < self.max_concurrent: - _, depth, url = await queue.get() - if url not in visited: - # Adding URL to the visited set here itself, (instead of after result generation) - # so that other tasks are not queued for same URL, found at different depth before - # crawling and extraction of this task is completed. 
- visited.add(url) - if parallel_processing: - task = asyncio.create_task(self.process_url(url, depth, crawler, queue, visited, depths)) - pending_tasks.add(task) - else: - result = await self.process_url(url, depth, crawler, queue, visited, depths) + # Process completed tasks + if pending_tasks: + done, pending_tasks = await asyncio.wait( + pending_tasks, + return_when=asyncio.FIRST_COMPLETED + ) + for task in done: + result = await task if result: - yield result + yield result + + except Exception as e: + self.logger.error(f"Error in crawl process: {e}") + raise + + finally: + # Clean up any remaining tasks + for task in pending_tasks: + task.cancel() + self.stats.end_time = datetime.now() - # Wait for the first task to complete and yield results incrementally as each task is completed - if pending_tasks: - done, pending_tasks = await asyncio.wait(pending_tasks, return_when=asyncio.FIRST_COMPLETED) - for task in done: - result = await task - if result: - yield result \ No newline at end of file + async def shutdown(self): + """Clean up resources and stop crawling""" + self._cancel_event.set() + # Clear caches and close connections + self.robot_parsers.clear() + self.domain_queues.clear() \ No newline at end of file From d11c004fbb7b0719611853654eab9080e3836802 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 8 Nov 2024 15:57:23 +0800 Subject: [PATCH 14/28] Enhanced BFS Strategy: Improved monitoring, resource management & configuration - Added CrawlStats for comprehensive crawl monitoring - Implemented proper resource cleanup with shutdown mechanism - Enhanced URL processing with better validation and politeness controls - Added configuration options (max_concurrent, timeout, external_links) - Improved error handling with retry logic - Added domain-specific queues for better performance - Created comprehensive documentation Note: URL normalization needs review - potential duplicate processing with core crawler for internal links. 
Currently commented out pending further investigation of edge cases. --- crawl4ai/scraper/bfs_scraper_strategy.py | 83 +++++++- docs/scrapper/bfs_scraper_strategy.md | 244 +++++++++++++++++++++++ 2 files changed, 320 insertions(+), 7 deletions(-) create mode 100644 docs/scrapper/bfs_scraper_strategy.md diff --git a/crawl4ai/scraper/bfs_scraper_strategy.py b/crawl4ai/scraper/bfs_scraper_strategy.py index 4506dbfe..72935008 100644 --- a/crawl4ai/scraper/bfs_scraper_strategy.py +++ b/crawl4ai/scraper/bfs_scraper_strategy.py @@ -76,6 +76,7 @@ class BFSScraperStrategy(ScraperStrategy): # Crawl control self.stats = CrawlStats(start_time=datetime.now()) self._cancel_event = asyncio.Event() + self.process_external_links = False # Rate limiting and politeness self.rate_limiter = AsyncLimiter(1, 1) @@ -84,7 +85,14 @@ class BFSScraperStrategy(ScraperStrategy): self.domain_queues: Dict[str, asyncio.Queue] = defaultdict(asyncio.Queue) async def can_process_url(self, url: str) -> bool: - """Check if URL can be processed based on robots.txt and filters""" + """Check if URL can be processed based on robots.txt and filters + This is our gatekeeper method that determines if a URL should be processed. It: + - Validates URL format using the validators library + - Checks robots.txt permissions for the domain + - Applies custom filters from the filter chain + - Updates statistics for blocked URLs + - Returns False early if any check fails + """ if not validators.url(url): self.logger.warning(f"Invalid URL: {url}") return False @@ -98,7 +106,13 @@ class BFSScraperStrategy(ScraperStrategy): return self.filter_chain.apply(url) async def _get_robot_parser(self, url: str) -> Optional[RobotFileParser]: - """Get or create robots.txt parser for domain""" + """Get or create robots.txt parser for domain. 
+ This is our robots.txt manager that: + - Uses domain-level caching of robot parsers + - Creates and caches new parsers as needed + - Handles failed robots.txt fetches gracefully + - Returns None if robots.txt can't be fetched, allowing crawling to proceed + """ domain = urlparse(url).netloc if domain not in self.robot_parsers: parser = RobotFileParser() @@ -136,7 +150,17 @@ class BFSScraperStrategy(ScraperStrategy): visited: Set[str], depths: Dict[str, int] ) -> Optional[CrawlResult]: - """Process a single URL and extract links""" + """Process a single URL and extract links. + This is our main URL processing workhorse that: + - Checks for cancellation + - Validates URLs through can_process_url + - Implements politeness delays per domain + - Applies rate limiting + - Handles crawling with retries + - Updates various statistics + - Processes extracted links + - Returns the crawl result or None on failure + """ if self._cancel_event.is_set(): return None @@ -176,11 +200,24 @@ class BFSScraperStrategy(ScraperStrategy): visited: Set[str], depths: Dict[str, int] ): - """Process extracted links from crawl result""" - for link_type in ["internal", "external"]: + """Process extracted links from crawl result. 
+ This is our link processor that: + Handles both internal and external links + Normalizes URLs (removes fragments) + Checks depth limits + Scores URLs for priority + Updates depth tracking + Adds valid URLs to the queue + Updates maximum depth statistics + """ + links_ro_process = result.links["internal"] + if self.process_external_links: + links_ro_process += result.links["external"] + for link_type in links_ro_process: for link in result.links[link_type]: - url = urljoin(source_url, link['href']) - url = urlunparse(urlparse(url)._replace(fragment="")) + url = link['href'] + # url = urljoin(source_url, link['href']) + # url = urlunparse(urlparse(url)._replace(fragment="")) if url not in visited and await self.can_process_url(url): new_depth = depths[source_url] + 1 @@ -202,6 +239,15 @@ class BFSScraperStrategy(ScraperStrategy): """Implement BFS crawling strategy""" # Initialize crawl state + """ + queue: A priority queue where items are tuples of (score, depth, url) + Score: Determines crawling priority (lower = higher priority) + Depth: Current distance from start_url + URL: The actual URL to crawl + visited: Keeps track of URLs we've already seen to avoid cycles + depths: Maps URLs to their depths from the start URL + pending_tasks: Tracks currently running crawl tasks + """ queue = asyncio.PriorityQueue() await queue.put((0, 0, start_url)) visited: Set[str] = set() @@ -210,8 +256,24 @@ class BFSScraperStrategy(ScraperStrategy): try: while (not queue.empty() or pending_tasks) and not self._cancel_event.is_set(): + """ + This sets up our main control loop which: + - Continues while there are URLs to process (not queue.empty()) + - Or while there are tasks still running (pending_tasks) + - Can be interrupted via cancellation (not self._cancel_event.is_set()) + """ # Start new tasks up to max_concurrent while not queue.empty() and len(pending_tasks) < self.max_concurrent: + """ + This section manages task creation: + Checks if we can start more tasks (under 
max_concurrent limit) + Gets the next URL from the priority queue + Marks URLs as visited immediately to prevent duplicates + Updates current depth in stats + Either: + Creates a new async task (parallel mode) + Processes URL directly (sequential mode) + """ _, depth, url = await queue.get() if url not in visited: visited.add(url) @@ -230,6 +292,13 @@ class BFSScraperStrategy(ScraperStrategy): yield result # Process completed tasks + """ + This section manages completed tasks: + Waits for any task to complete using asyncio.wait + Uses FIRST_COMPLETED to handle results as soon as they're ready + Yields successful results to the caller + Updates pending_tasks to remove completed ones + """ if pending_tasks: done, pending_tasks = await asyncio.wait( pending_tasks, diff --git a/docs/scrapper/bfs_scraper_strategy.md b/docs/scrapper/bfs_scraper_strategy.md new file mode 100644 index 00000000..7fe1319c --- /dev/null +++ b/docs/scrapper/bfs_scraper_strategy.md @@ -0,0 +1,244 @@ +# BFS Scraper Strategy: Smart Web Traversal + +The BFS (Breadth-First Search) Scraper Strategy provides an intelligent way to traverse websites systematically. It crawls websites level by level, ensuring thorough coverage while respecting web crawling etiquette. 
+ +```mermaid +flowchart TB + Start([Start]) --> Init[Initialize BFS Strategy] + Init --> InitStats[Initialize CrawlStats] + InitStats --> InitQueue[Initialize Priority Queue] + InitQueue --> AddStart[Add Start URL to Queue] + + AddStart --> CheckState{Queue Empty or\nTasks Pending?} + CheckState -->|No| Cleanup[Cleanup & Stats] + Cleanup --> End([End]) + + CheckState -->|Yes| CheckCancel{Cancel\nRequested?} + CheckCancel -->|Yes| Cleanup + + CheckCancel -->|No| CheckConcurrent{Under Max\nConcurrent?} + + CheckConcurrent -->|No| WaitComplete[Wait for Task Completion] + WaitComplete --> YieldResult[Yield Result] + YieldResult --> CheckState + + CheckConcurrent -->|Yes| GetNextURL[Get Next URL from Queue] + + GetNextURL --> ValidateURL{Already\nVisited?} + ValidateURL -->|Yes| CheckState + + ValidateURL -->|No| ProcessURL[Process URL] + + subgraph URL_Processing [URL Processing] + ProcessURL --> CheckValid{URL Valid?} + CheckValid -->|No| UpdateStats[Update Skip Stats] + + CheckValid -->|Yes| CheckRobots{Allowed by\nrobots.txt?} + CheckRobots -->|No| UpdateRobotStats[Update Robot Stats] + + CheckRobots -->|Yes| ApplyDelay[Apply Politeness Delay] + ApplyDelay --> FetchContent[Fetch Content with Rate Limit] + + FetchContent --> CheckError{Error?} + CheckError -->|Yes| Retry{Retry\nNeeded?} + Retry -->|Yes| FetchContent + Retry -->|No| UpdateFailStats[Update Fail Stats] + + CheckError -->|No| ExtractLinks[Extract & Process Links] + ExtractLinks --> ScoreURLs[Score New URLs] + ScoreURLs --> AddToQueue[Add to Priority Queue] + end + + ProcessURL --> CreateTask{Parallel\nProcessing?} + CreateTask -->|Yes| AddTask[Add to Pending Tasks] + CreateTask -->|No| DirectProcess[Process Directly] + + AddTask --> CheckState + DirectProcess --> YieldResult + + UpdateStats --> CheckState + UpdateRobotStats --> CheckState + UpdateFailStats --> CheckState + + classDef process fill:#90caf9,stroke:#000,stroke-width:2px; + classDef decision fill:#fff59d,stroke:#000,stroke-width:2px; + 
classDef error fill:#ef9a9a,stroke:#000,stroke-width:2px; + classDef stats fill:#a5d6a7,stroke:#000,stroke-width:2px; + + class Start,End stats; + class CheckState,CheckCancel,CheckConcurrent,ValidateURL,CheckValid,CheckRobots,CheckError,Retry,CreateTask decision; + class UpdateStats,UpdateRobotStats,UpdateFailStats,InitStats,Cleanup stats; + class ProcessURL,FetchContent,ExtractLinks,ScoreURLs process; +``` + +## How It Works + +The BFS strategy crawls a website by: +1. Starting from a root URL +2. Processing all URLs at the current depth +3. Moving to URLs at the next depth level +4. Continuing until maximum depth is reached + +This ensures systematic coverage of the website while maintaining control over the crawling process. + +## Key Features + +### 1. Smart URL Processing +```python +strategy = BFSScraperStrategy( + max_depth=2, + filter_chain=my_filters, + url_scorer=my_scorer, + max_concurrent=5 +) +``` +- Controls crawl depth +- Filters unwanted URLs +- Scores URLs for priority +- Manages concurrent requests + +### 2. Polite Crawling +The strategy automatically implements web crawling best practices: +- Respects robots.txt +- Implements rate limiting +- Adds politeness delays +- Manages concurrent requests + +### 3. Link Processing Control +```python +strategy = BFSScraperStrategy( + ..., + process_external_links=False # Only process internal links +) +``` +- Control whether to follow external links +- Default: internal links only +- Enable external links when needed + +## Configuration Options + +| Parameter | Description | Default | +|-----------|-------------|---------| +| max_depth | Maximum crawl depth | Required | +| filter_chain | URL filtering rules | Required | +| url_scorer | URL priority scoring | Required | +| max_concurrent | Max parallel requests | 5 | +| min_crawl_delay | Seconds between requests | 1 | +| process_external_links | Follow external links | False | + +## Best Practices + +1. 
**Set Appropriate Depth** + - Start with smaller depths (2-3) + - Increase based on needs + - Consider site structure + +2. **Configure Filters** + - Use URL patterns + - Filter by content type + - Avoid unwanted sections + +3. **Tune Performance** + - Adjust max_concurrent + - Set appropriate delays + - Monitor resource usage + +4. **Handle External Links** + - Keep external_links=False for focused crawls + - Enable only when needed + - Consider additional filtering + +## Example Usage + +```python +from crawl4ai.scraper import BFSScraperStrategy +from crawl4ai.scraper.filters import FilterChain +from crawl4ai.scraper.scorers import BasicURLScorer + +# Configure strategy +strategy = BFSScraperStrategy( + max_depth=3, + filter_chain=FilterChain([ + URLPatternFilter("*.example.com/*"), + ContentTypeFilter(["text/html"]) + ]), + url_scorer=BasicURLScorer(), + max_concurrent=5, + min_crawl_delay=1, + process_external_links=False +) + +# Use with AsyncWebScraper +scraper = AsyncWebScraper(crawler, strategy) +results = await scraper.ascrape("https://example.com") +``` + +## Common Use Cases + +### 1. Site Mapping +```python +strategy = BFSScraperStrategy( + max_depth=5, + filter_chain=site_filter, + url_scorer=depth_scorer, + process_external_links=False +) +``` +Perfect for creating complete site maps or understanding site structure. + +### 2. Content Aggregation +```python +strategy = BFSScraperStrategy( + max_depth=2, + filter_chain=content_filter, + url_scorer=relevance_scorer, + max_concurrent=3 +) +``` +Ideal for collecting specific types of content (articles, products, etc.). + +### 3. Link Analysis +```python +strategy = BFSScraperStrategy( + max_depth=1, + filter_chain=link_filter, + url_scorer=link_scorer, + process_external_links=True +) +``` +Useful for analyzing both internal and external link structures. 
+ +## Advanced Features + +### Progress Monitoring +```python +async for result in scraper.ascrape(url): + print(f"Current depth: {strategy.stats.current_depth}") + print(f"Processed URLs: {strategy.stats.urls_processed}") +``` + +### Custom URL Scoring +```python +class CustomScorer(URLScorer): + def score(self, url: str) -> float: + # Lower scores = higher priority + return score_based_on_criteria(url) +``` + +## Troubleshooting + +1. **Slow Crawling** + - Increase max_concurrent + - Adjust min_crawl_delay + - Check network conditions + +2. **Missing Content** + - Verify max_depth + - Check filter settings + - Review URL patterns + +3. **High Resource Usage** + - Reduce max_concurrent + - Increase crawl delay + - Add more specific filters + From bae4665949d7a9c52fa9649f58460bac85fb3e69 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 8 Nov 2024 18:45:12 +0800 Subject: [PATCH 15/28] feat(scraper): Enhance URL filtering and scoring systems Implement comprehensive URL filtering and scoring capabilities: Filters: - Add URLPatternFilter with glob/regex support - Implement ContentTypeFilter with MIME type checking - Add DomainFilter for domain control - Create FilterChain with stats tracking Scorers: - Complete KeywordRelevanceScorer implementation - Add PathDepthScorer for URL structure scoring - Implement ContentTypeScorer for file type priorities - Add FreshnessScorer for date-based scoring - Add DomainAuthorityScorer for domain weighting - Create CompositeScorer for combined strategies Features: - Add statistics tracking for both filters and scorers - Implement logging support throughout - Add resource cleanup methods - Create comprehensive documentation - Include performance optimizations Tests and docs included. Note: Review URL normalization overlap with recent crawler changes. 
- Quick Start is created and added --- crawl4ai/scraper/bfs_scraper_strategy copy.py | 138 ------- crawl4ai/scraper/filters/__init__.py | 208 ++++++++++- .../scraper/filters/content_type_filter.py | 43 ++- crawl4ai/scraper/filters/url_filter.py | 68 +++- .../scraper/filters/url_pattern_filter.py | 38 +- crawl4ai/scraper/scorers/__init__.py | 270 +++++++++++++- docs/scrapper/filters_scrorers.md | 342 ++++++++++++++++++ docs/scrapper/how_to_use.md | 206 +++++++++++ docs/scrapper/web_crawler_quick_start.py | 111 ++++++ tests/test_scraper.py | 184 ++++++++++ 10 files changed, 1451 insertions(+), 157 deletions(-) delete mode 100644 crawl4ai/scraper/bfs_scraper_strategy copy.py create mode 100644 docs/scrapper/filters_scrorers.md create mode 100644 docs/scrapper/how_to_use.md create mode 100644 docs/scrapper/web_crawler_quick_start.py create mode 100644 tests/test_scraper.py diff --git a/crawl4ai/scraper/bfs_scraper_strategy copy.py b/crawl4ai/scraper/bfs_scraper_strategy copy.py deleted file mode 100644 index 51bf9cb3..00000000 --- a/crawl4ai/scraper/bfs_scraper_strategy copy.py +++ /dev/null @@ -1,138 +0,0 @@ -from .scraper_strategy import ScraperStrategy -from .filters import FilterChain -from .scorers import URLScorer -from ..models import CrawlResult -from ..async_webcrawler import AsyncWebCrawler -import asyncio -import validators -from urllib.parse import urljoin,urlparse,urlunparse -from urllib.robotparser import RobotFileParser -import time -from aiolimiter import AsyncLimiter -from tenacity import retry, stop_after_attempt, wait_exponential -from collections import defaultdict -import logging -from typing import Dict, AsyncGenerator -logging.basicConfig(level=logging.DEBUG) - -rate_limiter = AsyncLimiter(1, 1) # 1 request per second - -class BFSScraperStrategy(ScraperStrategy): - def __init__(self, max_depth: int, filter_chain: FilterChain, url_scorer: URLScorer, max_concurrent: int = 5, min_crawl_delay: int=1): - self.max_depth = max_depth - self.filter_chain 
= filter_chain - self.url_scorer = url_scorer - self.max_concurrent = max_concurrent - # For Crawl Politeness - self.last_crawl_time = defaultdict(float) - self.min_crawl_delay = min_crawl_delay # 1 second delay between requests to the same domain - # For Robots.txt Compliance - self.robot_parsers = {} - - # Robots.txt Parser - def get_robot_parser(self, url: str) -> RobotFileParser: - domain = urlparse(url) - scheme = domain.scheme if domain.scheme else 'http' # Default to 'http' if no scheme provided - netloc = domain.netloc - if netloc not in self.robot_parsers: - rp = RobotFileParser() - rp.set_url(f"{scheme}://{netloc}/robots.txt") - try: - rp.read() - except Exception as e: - # Log the type of error, message, and the URL - logging.warning(f"Error {type(e).__name__} occurred while fetching robots.txt for {netloc}: {e}") - return None - self.robot_parsers[netloc] = rp - return self.robot_parsers[netloc] - - # Retry with exponential backoff - @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10)) - async def retry_crawl(self, crawler: AsyncWebCrawler, url: str) -> CrawlResult: - return await crawler.arun(url) - - async def process_url(self, url: str, depth: int, crawler: AsyncWebCrawler, queue: asyncio.PriorityQueue, visited: set, depths: Dict[str, int]) -> AsyncGenerator[CrawlResult, None]: - def normalize_url(url: str) -> str: - parsed = urlparse(url) - return urlunparse(parsed._replace(fragment="")) - - # URL Validation - if not validators.url(url): - logging.warning(f"Invalid URL: {url}") - return None - - # Robots.txt Compliance - robot_parser = self.get_robot_parser(url) - if robot_parser is None: - logging.info(f"Could not retrieve robots.txt for {url}, hence proceeding with crawl.") - else: - # If robots.txt was fetched, check if crawling is allowed - if not robot_parser.can_fetch(crawler.crawler_strategy.user_agent, url): - logging.info(f"Skipping {url} as per robots.txt") - return None - - # Crawl Politeness - domain = 
urlparse(url).netloc - time_since_last_crawl = time.time() - self.last_crawl_time[domain] - if time_since_last_crawl < self.min_crawl_delay: - await asyncio.sleep(self.min_crawl_delay - time_since_last_crawl) - self.last_crawl_time[domain] = time.time() - - # Rate Limiting - async with rate_limiter: - # Error Handling - try: - crawl_result = await self.retry_crawl(crawler, url) - except Exception as e: - logging.error(f"Error crawling {url}: {str(e)}") - crawl_result = CrawlResult(url=url, html="", success=False, status_code=0, error_message=str(e)) - - if not crawl_result.success: - # Logging and Monitoring - logging.error(f"Failed to crawl URL: {url}. Error: {crawl_result.error_message}") - return crawl_result - - # Process links - for link_type in ["internal", "external"]: - for link in crawl_result.links[link_type]: - absolute_link = urljoin(url, link['href']) - normalized_link = normalize_url(absolute_link) - if self.filter_chain.apply(normalized_link) and normalized_link not in visited: - new_depth = depths[url] + 1 - if new_depth <= self.max_depth: - # URL Scoring - score = self.url_scorer.score(normalized_link) - await queue.put((score, new_depth, normalized_link)) - depths[normalized_link] = new_depth - return crawl_result - - async def ascrape(self, start_url: str, crawler: AsyncWebCrawler, parallel_processing:bool = True) -> AsyncGenerator[CrawlResult,None]: - queue = asyncio.PriorityQueue() - queue.put_nowait((0, 0, start_url)) - visited = set() - depths = {start_url: 0} - pending_tasks = set() - - while not queue.empty() or pending_tasks: - while not queue.empty() and len(pending_tasks) < self.max_concurrent: - _, depth, url = await queue.get() - if url not in visited: - # Adding URL to the visited set here itself, (instead of after result generation) - # so that other tasks are not queued for same URL, found at different depth before - # crawling and extraction of this task is completed. 
- visited.add(url) - if parallel_processing: - task = asyncio.create_task(self.process_url(url, depth, crawler, queue, visited, depths)) - pending_tasks.add(task) - else: - result = await self.process_url(url, depth, crawler, queue, visited, depths) - if result: - yield result - - # Wait for the first task to complete and yield results incrementally as each task is completed - if pending_tasks: - done, pending_tasks = await asyncio.wait(pending_tasks, return_when=asyncio.FIRST_COMPLETED) - for task in done: - result = await task - if result: - yield result \ No newline at end of file diff --git a/crawl4ai/scraper/filters/__init__.py b/crawl4ai/scraper/filters/__init__.py index 525c9bdb..df5d13aa 100644 --- a/crawl4ai/scraper/filters/__init__.py +++ b/crawl4ai/scraper/filters/__init__.py @@ -1,3 +1,205 @@ -from .url_filter import URLFilter, FilterChain -from .content_type_filter import ContentTypeFilter -from .url_pattern_filter import URLPatternFilter \ No newline at end of file +# from .url_filter import URLFilter, FilterChain +# from .content_type_filter import ContentTypeFilter +# from .url_pattern_filter import URLPatternFilter + +from abc import ABC, abstractmethod +from typing import List, Pattern, Set, Union +import re +from urllib.parse import urlparse +import mimetypes +import logging +from dataclasses import dataclass +import fnmatch + +@dataclass +class FilterStats: + """Statistics for filter applications""" + total_urls: int = 0 + rejected_urls: int = 0 + passed_urls: int = 0 + +class URLFilter(ABC): + """Base class for URL filters""" + + def __init__(self, name: str = None): + self.name = name or self.__class__.__name__ + self.stats = FilterStats() + self.logger = logging.getLogger(f"urlfilter.{self.name}") + + @abstractmethod + def apply(self, url: str) -> bool: + """Apply the filter to a URL""" + pass + + def _update_stats(self, passed: bool): + """Update filter statistics""" + self.stats.total_urls += 1 + if passed: + self.stats.passed_urls += 1 + 
else: + self.stats.rejected_urls += 1 + +class FilterChain: + """Chain of URL filters.""" + + def __init__(self, filters: List[URLFilter] = None): + self.filters = filters or [] + self.stats = FilterStats() + self.logger = logging.getLogger("urlfilter.chain") + + def add_filter(self, filter_: URLFilter) -> 'FilterChain': + """Add a filter to the chain""" + self.filters.append(filter_) + return self # Enable method chaining + + def apply(self, url: str) -> bool: + """Apply all filters in the chain""" + self.stats.total_urls += 1 + + for filter_ in self.filters: + if not filter_.apply(url): + self.stats.rejected_urls += 1 + self.logger.debug(f"URL {url} rejected by {filter_.name}") + return False + + self.stats.passed_urls += 1 + return True + +class URLPatternFilter(URLFilter): + """Filter URLs based on glob patterns or regex. + + pattern_filter = URLPatternFilter([ + "*.example.com/*", # Glob pattern + "*/article/*", # Path pattern + re.compile(r"blog-\d+") # Regex pattern + ]) + + - Supports glob patterns and regex + - Multiple patterns per filter + - Pattern pre-compilation for performance + """ + + def __init__(self, patterns: Union[str, Pattern, List[Union[str, Pattern]]], + use_glob: bool = True): + super().__init__() + self.patterns = [patterns] if isinstance(patterns, (str, Pattern)) else patterns + self.use_glob = use_glob + self._compiled_patterns = [] + + for pattern in self.patterns: + if isinstance(pattern, str) and use_glob: + self._compiled_patterns.append(self._glob_to_regex(pattern)) + else: + self._compiled_patterns.append(re.compile(pattern) if isinstance(pattern, str) else pattern) + + def _glob_to_regex(self, pattern: str) -> Pattern: + """Convert glob pattern to regex""" + return re.compile(fnmatch.translate(pattern)) + + def apply(self, url: str) -> bool: + """Check if URL matches any of the patterns""" + matches = any(pattern.search(url) for pattern in self._compiled_patterns) + self._update_stats(matches) + return matches + +class 
ContentTypeFilter(URLFilter): + """Filter URLs based on expected content type. + + content_filter = ContentTypeFilter([ + "text/html", + "application/pdf" + ], check_extension=True) + + - Filter by MIME types + - Extension checking + - Support for multiple content types + """ + + def __init__(self, allowed_types: Union[str, List[str]], + check_extension: bool = True): + super().__init__() + self.allowed_types = [allowed_types] if isinstance(allowed_types, str) else allowed_types + self.check_extension = check_extension + self._normalize_types() + + def _normalize_types(self): + """Normalize content type strings""" + self.allowed_types = [t.lower() for t in self.allowed_types] + + def _check_extension(self, url: str) -> bool: + """Check URL's file extension""" + ext = urlparse(url).path.split('.')[-1].lower() if '.' in urlparse(url).path else '' + if not ext: + return True # No extension, might be dynamic content + + guessed_type = mimetypes.guess_type(url)[0] + return any(allowed in (guessed_type or '').lower() for allowed in self.allowed_types) + + def apply(self, url: str) -> bool: + """Check if URL's content type is allowed""" + result = True + if self.check_extension: + result = self._check_extension(url) + self._update_stats(result) + return result + +class DomainFilter(URLFilter): + """Filter URLs based on allowed/blocked domains. 
+ + domain_filter = DomainFilter( + allowed_domains=["example.com", "blog.example.com"], + blocked_domains=["ads.example.com"] + ) + + - Allow/block specific domains + - Subdomain support + - Efficient domain matching + """ + + def __init__(self, allowed_domains: Union[str, List[str]] = None, + blocked_domains: Union[str, List[str]] = None): + super().__init__() + self.allowed_domains = set(self._normalize_domains(allowed_domains)) if allowed_domains else None + self.blocked_domains = set(self._normalize_domains(blocked_domains)) if blocked_domains else set() + + def _normalize_domains(self, domains: Union[str, List[str]]) -> List[str]: + """Normalize domain strings""" + if isinstance(domains, str): + domains = [domains] + return [d.lower().strip() for d in domains] + + def _extract_domain(self, url: str) -> str: + """Extract domain from URL""" + return urlparse(url).netloc.lower() + + def apply(self, url: str) -> bool: + """Check if URL's domain is allowed""" + domain = self._extract_domain(url) + + if domain in self.blocked_domains: + self._update_stats(False) + return False + + if self.allowed_domains is not None and domain not in self.allowed_domains: + self._update_stats(False) + return False + + self._update_stats(True) + return True + +# Example usage: +def create_common_filter_chain() -> FilterChain: + """Create a commonly used filter chain""" + return FilterChain([ + URLPatternFilter([ + "*.html", "*.htm", # HTML files + "*/article/*", "*/blog/*" # Common content paths + ]), + ContentTypeFilter([ + "text/html", + "application/xhtml+xml" + ]), + DomainFilter( + blocked_domains=["ads.*", "analytics.*"] + ) + ]) \ No newline at end of file diff --git a/crawl4ai/scraper/filters/content_type_filter.py b/crawl4ai/scraper/filters/content_type_filter.py index 9173eb4a..6966afdb 100644 --- a/crawl4ai/scraper/filters/content_type_filter.py +++ b/crawl4ai/scraper/filters/content_type_filter.py @@ -1,8 +1,43 @@ from .url_filter import URLFilter +from typing import 
List, Union +from urllib.parse import urlparse +import mimetypes + class ContentTypeFilter(URLFilter): - def __init__(self, contentType: str): - self.contentType = contentType + """Filter URLs based on expected content type""" + + def __init__(self, allowed_types: Union[str, List[str]], + check_extension: bool = True): + super().__init__() + self.allowed_types = [allowed_types] if isinstance(allowed_types, str) else allowed_types + self.check_extension = check_extension + self._normalize_types() + + def _normalize_types(self): + """Normalize content type strings""" + self.allowed_types = [t.lower() for t in self.allowed_types] + + def _check_extension(self, url: str) -> bool: + """Check URL's file extension""" + ext = urlparse(url).path.split('.')[-1].lower() if '.' in urlparse(url).path else '' + if not ext: + return True # No extension, might be dynamic content + + guessed_type = mimetypes.guess_type(url)[0] + return any(allowed in (guessed_type or '').lower() for allowed in self.allowed_types) + def apply(self, url: str) -> bool: - #TODO: This is a stub. Will implement this later - return True \ No newline at end of file + """Check if URL's content type is allowed""" + result = True + if self.check_extension: + result = self._check_extension(url) + self._update_stats(result) + return result + +# class ContentTypeFilter(URLFilter): +# def __init__(self, contentType: str): +# self.contentType = contentType +# def apply(self, url: str) -> bool: +# #TODO: This is a stub. 
Will implement this later +# return True \ No newline at end of file diff --git a/crawl4ai/scraper/filters/url_filter.py b/crawl4ai/scraper/filters/url_filter.py index 2b8bd6eb..88a2c60a 100644 --- a/crawl4ai/scraper/filters/url_filter.py +++ b/crawl4ai/scraper/filters/url_filter.py @@ -1,16 +1,72 @@ from abc import ABC, abstractmethod +from dataclasses import dataclass +import logging +from typing import List +@dataclass +class FilterStats: + """Statistics for filter applications""" + total_urls: int = 0 + rejected_urls: int = 0 + passed_urls: int = 0 class URLFilter(ABC): + """Base class for URL filters""" + + def __init__(self, name: str = None): + self.name = name or self.__class__.__name__ + self.stats = FilterStats() + self.logger = logging.getLogger(f"urlfilter.{self.name}") + @abstractmethod def apply(self, url: str) -> bool: + """Apply the filter to a URL""" pass -class FilterChain: - def __init__(self): - self.filters = [] + def _update_stats(self, passed: bool): + """Update filter statistics""" + self.stats.total_urls += 1 + if passed: + self.stats.passed_urls += 1 + else: + self.stats.rejected_urls += 1 - def add_filter(self, filter: URLFilter): - self.filters.append(filter) +class FilterChain: + """Chain of URL filters""" + + def __init__(self, filters: List[URLFilter] = None): + self.filters = filters or [] + self.stats = FilterStats() + self.logger = logging.getLogger("urlfilter.chain") + + def add_filter(self, filter_: URLFilter) -> 'FilterChain': + """Add a filter to the chain""" + self.filters.append(filter_) + return self # Enable method chaining def apply(self, url: str) -> bool: - return all(filter.apply(url) for filter in self.filters) \ No newline at end of file + """Apply all filters in the chain""" + self.stats.total_urls += 1 + + for filter_ in self.filters: + if not filter_.apply(url): + self.stats.rejected_urls += 1 + self.logger.debug(f"URL {url} rejected by {filter_.name}") + return False + + self.stats.passed_urls += 1 + return True + 
+# class URLFilter(ABC): +# @abstractmethod +# def apply(self, url: str) -> bool: +# pass + +# class FilterChain: +# def __init__(self): +# self.filters = [] + +# def add_filter(self, filter: URLFilter): +# self.filters.append(filter) + +# def apply(self, url: str) -> bool: +# return all(filter.apply(url) for filter in self.filters) \ No newline at end of file diff --git a/crawl4ai/scraper/filters/url_pattern_filter.py b/crawl4ai/scraper/filters/url_pattern_filter.py index fd5df133..1e02b4a6 100644 --- a/crawl4ai/scraper/filters/url_pattern_filter.py +++ b/crawl4ai/scraper/filters/url_pattern_filter.py @@ -1,9 +1,39 @@ from .url_filter import URLFilter from re import Pattern +from typing import List, Union +import re +import fnmatch + class URLPatternFilter(URLFilter): - def __init__(self, pattern: Pattern): - self.pattern = pattern + """Filter URLs based on glob patterns or regex""" + + def __init__(self, patterns: Union[str, Pattern, List[Union[str, Pattern]]], + use_glob: bool = True): + super().__init__() + self.patterns = [patterns] if isinstance(patterns, (str, Pattern)) else patterns + self.use_glob = use_glob + self._compiled_patterns = [] + + for pattern in self.patterns: + if isinstance(pattern, str) and use_glob: + self._compiled_patterns.append(self._glob_to_regex(pattern)) + else: + self._compiled_patterns.append(re.compile(pattern) if isinstance(pattern, str) else pattern) + + def _glob_to_regex(self, pattern: str) -> Pattern: + """Convert glob pattern to regex""" + return re.compile(fnmatch.translate(pattern)) + def apply(self, url: str) -> bool: - #TODO: This is a stub. Will implement this later. 
- return True \ No newline at end of file + """Check if URL matches any of the patterns""" + matches = any(pattern.search(url) for pattern in self._compiled_patterns) + self._update_stats(matches) + return matches + +# class URLPatternFilter(URLFilter): +# def __init__(self, pattern: Pattern): +# self.pattern = pattern +# def apply(self, url: str) -> bool: +# #TODO: This is a stub. Will implement this later. +# return True \ No newline at end of file diff --git a/crawl4ai/scraper/scorers/__init__.py b/crawl4ai/scraper/scorers/__init__.py index 05c61c94..548b80f0 100644 --- a/crawl4ai/scraper/scorers/__init__.py +++ b/crawl4ai/scraper/scorers/__init__.py @@ -1,2 +1,268 @@ -from .url_scorer import URLScorer -from .keyword_relevance_scorer import KeywordRelevanceScorer \ No newline at end of file +# from .url_scorer import URLScorer +# from .keyword_relevance_scorer import KeywordRelevanceScorer + +from abc import ABC, abstractmethod +from typing import List, Dict, Optional, Union +from dataclasses import dataclass +from urllib.parse import urlparse, unquote +import re +from collections import defaultdict +import math +import logging + +@dataclass +class ScoringStats: + """Statistics for URL scoring""" + urls_scored: int = 0 + total_score: float = 0.0 + min_score: float = float('inf') + max_score: float = float('-inf') + + def update(self, score: float): + """Update scoring statistics""" + self.urls_scored += 1 + self.total_score += score + self.min_score = min(self.min_score, score) + self.max_score = max(self.max_score, score) + + @property + def average_score(self) -> float: + """Calculate average score""" + return self.total_score / self.urls_scored if self.urls_scored > 0 else 0.0 + +class URLScorer(ABC): + """Base class for URL scoring strategies""" + + def __init__(self, weight: float = 1.0, name: str = None): + self.weight = weight + self.name = name or self.__class__.__name__ + self.stats = ScoringStats() + self.logger = 
logging.getLogger(f"urlscorer.{self.name}") + + @abstractmethod + def _calculate_score(self, url: str) -> float: + """Calculate the raw score for a URL""" + pass + + def score(self, url: str) -> float: + """Calculate the weighted score for a URL""" + raw_score = self._calculate_score(url) + weighted_score = raw_score * self.weight + self.stats.update(weighted_score) + return weighted_score + +class CompositeScorer(URLScorer): + """Combines multiple scorers with weights""" + + def __init__(self, scorers: List[URLScorer], normalize: bool = True): + super().__init__(name="CompositeScorer") + self.scorers = scorers + self.normalize = normalize + + def _calculate_score(self, url: str) -> float: + scores = [scorer.score(url) for scorer in self.scorers] + total_score = sum(scores) + + if self.normalize and scores: + total_score /= len(scores) + + return total_score + +class KeywordRelevanceScorer(URLScorer): + """Score URLs based on keyword relevance. + + keyword_scorer = KeywordRelevanceScorer( + keywords=["python", "programming"], + weight=1.0, + case_sensitive=False + ) + + - Score based on keyword matches + - Case sensitivity options + - Weighted scoring + """ + + def __init__(self, keywords: List[str], weight: float = 1.0, + case_sensitive: bool = False): + super().__init__(weight=weight) + self.keywords = keywords + self.case_sensitive = case_sensitive + self._compile_keywords() + + def _compile_keywords(self): + """Prepare keywords for matching""" + flags = 0 if self.case_sensitive else re.IGNORECASE + self.patterns = [re.compile(re.escape(k), flags) for k in self.keywords] + + def _calculate_score(self, url: str) -> float: + """Calculate score based on keyword matches""" + decoded_url = unquote(url) + total_matches = sum( + 1 for pattern in self.patterns + if pattern.search(decoded_url) + ) + # Normalize score between 0 and 1 + return total_matches / len(self.patterns) if self.patterns else 0.0 + +class PathDepthScorer(URLScorer): + """Score URLs based on their 
path depth. + + path_scorer = PathDepthScorer( + optimal_depth=3, # Preferred URL depth + weight=0.7 + ) + + - Score based on URL path depth + - Configurable optimal depth + - Diminishing returns for deeper paths + """ + + def __init__(self, optimal_depth: int = 3, weight: float = 1.0): + super().__init__(weight=weight) + self.optimal_depth = optimal_depth + + def _calculate_score(self, url: str) -> float: + """Calculate score based on path depth""" + path = urlparse(url).path + depth = len([x for x in path.split('/') if x]) + + # Score decreases as we move away from optimal depth + distance_from_optimal = abs(depth - self.optimal_depth) + return 1.0 / (1.0 + distance_from_optimal) + +class ContentTypeScorer(URLScorer): + """Score URLs based on content type preferences. + + content_scorer = ContentTypeScorer({ + r'\.html$': 1.0, + r'\.pdf$': 0.8, + r'\.xml$': 0.6 + }) + + - Score based on file types + - Configurable type weights + - Pattern matching support + """ + + def __init__(self, type_weights: Dict[str, float], weight: float = 1.0): + super().__init__(weight=weight) + self.type_weights = type_weights + self._compile_patterns() + + def _compile_patterns(self): + """Prepare content type patterns""" + self.patterns = { + re.compile(pattern): weight + for pattern, weight in self.type_weights.items() + } + + def _calculate_score(self, url: str) -> float: + """Calculate score based on content type matching""" + for pattern, weight in self.patterns.items(): + if pattern.search(url): + return weight + return 0.0 + +class FreshnessScorer(URLScorer): + """Score URLs based on freshness indicators. 
+ + freshness_scorer = FreshnessScorer(weight=0.9) + + Score based on date indicators in URLs + Multiple date format support + Recency weighting""" + + def __init__(self, weight: float = 1.0): + super().__init__(weight=weight) + self.date_patterns = [ + r'/(\d{4})/(\d{2})/(\d{2})/', # yyyy/mm/dd + r'(\d{4})[-_](\d{2})[-_](\d{2})', # yyyy-mm-dd + r'/(\d{4})/', # year only + ] + self._compile_patterns() + + def _compile_patterns(self): + """Prepare date patterns""" + self.compiled_patterns = [re.compile(p) for p in self.date_patterns] + + def _calculate_score(self, url: str) -> float: + """Calculate score based on date indicators""" + for pattern in self.compiled_patterns: + if match := pattern.search(url): + year = int(match.group(1)) + # Score higher for more recent years + return 1.0 - (2024 - year) * 0.1 + return 0.5 # Default score for URLs without dates + +class DomainAuthorityScorer(URLScorer): + """Score URLs based on domain authority. + + authority_scorer = DomainAuthorityScorer({ + "python.org": 1.0, + "github.com": 0.9, + "medium.com": 0.7 + }) + + Score based on domain importance + Configurable domain weights + Default weight for unknown domains""" + + def __init__(self, domain_weights: Dict[str, float], + default_weight: float = 0.5, weight: float = 1.0): + super().__init__(weight=weight) + self.domain_weights = domain_weights + self.default_weight = default_weight + + def _calculate_score(self, url: str) -> float: + """Calculate score based on domain authority""" + domain = urlparse(url).netloc.lower() + return self.domain_weights.get(domain, self.default_weight) + +def create_balanced_scorer() -> CompositeScorer: + """Create a balanced composite scorer""" + return CompositeScorer([ + KeywordRelevanceScorer( + keywords=["article", "blog", "news", "research"], + weight=1.0 + ), + PathDepthScorer( + optimal_depth=3, + weight=0.7 + ), + ContentTypeScorer( + type_weights={ + r'\.html?$': 1.0, + r'\.pdf$': 0.8, + r'\.xml$': 0.6 + }, + weight=0.8 + ), + 
FreshnessScorer( + weight=0.9 + ) + ]) + +# Example Usage: +""" +# Create a composite scorer +scorer = CompositeScorer([ + KeywordRelevanceScorer(["python", "programming"], weight=1.0), + PathDepthScorer(optimal_depth=2, weight=0.7), + FreshnessScorer(weight=0.8), + DomainAuthorityScorer( + domain_weights={ + "python.org": 1.0, + "github.com": 0.9, + "medium.com": 0.7 + }, + weight=0.9 + ) +]) + +# Score a URL +score = scorer.score("https://python.org/article/2024/01/new-features") + +# Access statistics +print(f"Average score: {scorer.stats.average_score}") +print(f"URLs scored: {scorer.stats.urls_scored}") +""" \ No newline at end of file diff --git a/docs/scrapper/filters_scrorers.md b/docs/scrapper/filters_scrorers.md new file mode 100644 index 00000000..22b846c6 --- /dev/null +++ b/docs/scrapper/filters_scrorers.md @@ -0,0 +1,342 @@ +# URL Filters and Scorers + +The crawl4ai library provides powerful URL filtering and scoring capabilities that help you control and prioritize your web crawling. This guide explains how to use these features effectively. 
+ +```mermaid +flowchart TB + Start([URL Input]) --> Chain[Filter Chain] + + subgraph Chain Process + Chain --> Pattern{URL Pattern\nFilter} + Pattern -->|Match| Content{Content Type\nFilter} + Pattern -->|No Match| Reject1[Reject URL] + + Content -->|Allowed| Domain{Domain\nFilter} + Content -->|Not Allowed| Reject2[Reject URL] + + Domain -->|Allowed| Accept[Accept URL] + Domain -->|Blocked| Reject3[Reject URL] + end + + subgraph Statistics + Pattern --> UpdatePattern[Update Pattern Stats] + Content --> UpdateContent[Update Content Stats] + Domain --> UpdateDomain[Update Domain Stats] + Accept --> UpdateChain[Update Chain Stats] + Reject1 --> UpdateChain + Reject2 --> UpdateChain + Reject3 --> UpdateChain + end + + Accept --> End([End]) + Reject1 --> End + Reject2 --> End + Reject3 --> End + + classDef process fill:#90caf9,stroke:#000,stroke-width:2px; + classDef decision fill:#fff59d,stroke:#000,stroke-width:2px; + classDef reject fill:#ef9a9a,stroke:#000,stroke-width:2px; + classDef accept fill:#a5d6a7,stroke:#000,stroke-width:2px; + + class Start,End accept; + class Pattern,Content,Domain decision; + class Reject1,Reject2,Reject3 reject; + class Chain,UpdatePattern,UpdateContent,UpdateDomain,UpdateChain process; +``` + +## URL Filters + +URL filters help you control which URLs are crawled. Multiple filters can be chained together to create sophisticated filtering rules. + +### Available Filters + +1. **URL Pattern Filter** +```python +pattern_filter = URLPatternFilter([ + "*.example.com/*", # Glob pattern + "*/article/*", # Path pattern + re.compile(r"blog-\d+") # Regex pattern +]) +``` +- Supports glob patterns and regex +- Multiple patterns per filter +- Pattern pre-compilation for performance + +2. **Content Type Filter** +```python +content_filter = ContentTypeFilter([ + "text/html", + "application/pdf" +], check_extension=True) +``` +- Filter by MIME types +- Extension checking +- Support for multiple content types + +3. 
**Domain Filter** +```python +domain_filter = DomainFilter( + allowed_domains=["example.com", "blog.example.com"], + blocked_domains=["ads.example.com"] +) +``` +- Allow/block specific domains +- Subdomain support +- Efficient domain matching + +### Creating Filter Chains + +```python +# Create and configure a filter chain +filter_chain = FilterChain([ + URLPatternFilter(["*.example.com/*"]), + ContentTypeFilter(["text/html"]), + DomainFilter(blocked_domains=["ads.*"]) +]) + +# Add more filters +filter_chain.add_filter( + URLPatternFilter(["*/article/*"]) +) +``` + +```mermaid +flowchart TB + Start([URL Input]) --> Composite[Composite Scorer] + + subgraph Scoring Process + Composite --> Keywords[Keyword Relevance] + Composite --> Path[Path Depth] + Composite --> Content[Content Type] + Composite --> Fresh[Freshness] + Composite --> Domain[Domain Authority] + + Keywords --> KeywordScore[Calculate Score] + Path --> PathScore[Calculate Score] + Content --> ContentScore[Calculate Score] + Fresh --> FreshScore[Calculate Score] + Domain --> DomainScore[Calculate Score] + + KeywordScore --> Weight1[Apply Weight] + PathScore --> Weight2[Apply Weight] + ContentScore --> Weight3[Apply Weight] + FreshScore --> Weight4[Apply Weight] + DomainScore --> Weight5[Apply Weight] + end + + Weight1 --> Combine[Combine Scores] + Weight2 --> Combine + Weight3 --> Combine + Weight4 --> Combine + Weight5 --> Combine + + Combine --> Normalize{Normalize?} + Normalize -->|Yes| NormalizeScore[Normalize Combined Score] + Normalize -->|No| FinalScore[Final Score] + NormalizeScore --> FinalScore + + FinalScore --> Stats[Update Statistics] + Stats --> End([End]) + + classDef process fill:#90caf9,stroke:#000,stroke-width:2px; + classDef scorer fill:#fff59d,stroke:#000,stroke-width:2px; + classDef calc fill:#a5d6a7,stroke:#000,stroke-width:2px; + classDef decision fill:#ef9a9a,stroke:#000,stroke-width:2px; + + class Start,End calc; + class Keywords,Path,Content,Fresh,Domain scorer; + class 
KeywordScore,PathScore,ContentScore,FreshScore,DomainScore process; + class Normalize decision; +``` + +## URL Scorers + +URL scorers help prioritize which URLs to crawl first. Higher scores indicate higher priority. + +### Available Scorers + +1. **Keyword Relevance Scorer** +```python +keyword_scorer = KeywordRelevanceScorer( + keywords=["python", "programming"], + weight=1.0, + case_sensitive=False +) +``` +- Score based on keyword matches +- Case sensitivity options +- Weighted scoring + +2. **Path Depth Scorer** +```python +path_scorer = PathDepthScorer( + optimal_depth=3, # Preferred URL depth + weight=0.7 +) +``` +- Score based on URL path depth +- Configurable optimal depth +- Diminishing returns for deeper paths + +3. **Content Type Scorer** +```python +content_scorer = ContentTypeScorer({ + r'\.html$': 1.0, + r'\.pdf$': 0.8, + r'\.xml$': 0.6 +}) +``` +- Score based on file types +- Configurable type weights +- Pattern matching support + +4. **Freshness Scorer** +```python +freshness_scorer = FreshnessScorer(weight=0.9) +``` +- Score based on date indicators in URLs +- Multiple date format support +- Recency weighting + +5. **Domain Authority Scorer** +```python +authority_scorer = DomainAuthorityScorer({ + "python.org": 1.0, + "github.com": 0.9, + "medium.com": 0.7 +}) +``` +- Score based on domain importance +- Configurable domain weights +- Default weight for unknown domains + +### Combining Scorers + +```python +# Create a composite scorer +composite_scorer = CompositeScorer([ + KeywordRelevanceScorer(["python"], weight=1.0), + PathDepthScorer(optimal_depth=2, weight=0.7), + FreshnessScorer(weight=0.8) +], normalize=True) +``` + +## Best Practices + +### Filter Configuration + +1. **Start Restrictive** + ```python + # Begin with strict filters + filter_chain = FilterChain([ + DomainFilter(allowed_domains=["example.com"]), + ContentTypeFilter(["text/html"]) + ]) + ``` + +2. 
**Layer Filters** + ```python + # Add more specific filters + filter_chain.add_filter( + URLPatternFilter(["*/article/*", "*/blog/*"]) + ) + ``` + +3. **Monitor Filter Statistics** + ```python + # Check filter performance + for filter in filter_chain.filters: + print(f"{filter.name}: {filter.stats.rejected_urls} rejected") + ``` + +### Scorer Configuration + +1. **Balance Weights** + ```python + # Balanced scoring configuration + scorer = create_balanced_scorer() + ``` + +2. **Customize for Content** + ```python + # News site configuration + news_scorer = CompositeScorer([ + KeywordRelevanceScorer(["news", "article"], weight=1.0), + FreshnessScorer(weight=1.0), + PathDepthScorer(optimal_depth=2, weight=0.5) + ]) + ``` + +3. **Monitor Scoring Statistics** + ```python + # Check scoring distribution + print(f"Average score: {scorer.stats.average_score}") + print(f"Score range: {scorer.stats.min_score} - {scorer.stats.max_score}") + ``` + +## Common Use Cases + +### Blog Crawling +```python +blog_config = { + 'filters': FilterChain([ + URLPatternFilter(["*/blog/*", "*/post/*"]), + ContentTypeFilter(["text/html"]) + ]), + 'scorer': CompositeScorer([ + FreshnessScorer(weight=1.0), + KeywordRelevanceScorer(["blog", "article"], weight=0.8) + ]) +} +``` + +### Documentation Sites +```python +docs_config = { + 'filters': FilterChain([ + URLPatternFilter(["*/docs/*", "*/guide/*"]), + ContentTypeFilter(["text/html", "application/pdf"]) + ]), + 'scorer': CompositeScorer([ + PathDepthScorer(optimal_depth=3, weight=1.0), + KeywordRelevanceScorer(["guide", "tutorial"], weight=0.9) + ]) +} +``` + +### E-commerce Sites +```python +ecommerce_config = { + 'filters': FilterChain([ + URLPatternFilter(["*/product/*", "*/category/*"]), + DomainFilter(blocked_domains=["ads.*", "tracker.*"]) + ]), + 'scorer': CompositeScorer([ + PathDepthScorer(optimal_depth=2, weight=1.0), + ContentTypeScorer({ + r'/product/': 1.0, + r'/category/': 0.8 + }) + ]) +} +``` + +## Advanced Topics + +### Custom 
Filters +```python +class CustomFilter(URLFilter): + def apply(self, url: str) -> bool: + # Your custom filtering logic + return True +``` + +### Custom Scorers +```python +class CustomScorer(URLScorer): + def _calculate_score(self, url: str) -> float: + # Your custom scoring logic + return 1.0 +``` + +For more examples, check our [example repository](https://github.com/example/crawl4ai/examples). \ No newline at end of file diff --git a/docs/scrapper/how_to_use.md b/docs/scrapper/how_to_use.md new file mode 100644 index 00000000..79f7912f --- /dev/null +++ b/docs/scrapper/how_to_use.md @@ -0,0 +1,206 @@ +# Scraper Examples Guide + +This guide provides two complete examples of using the crawl4ai scraper: a basic implementation for simple use cases and an advanced implementation showcasing all features. + +## Basic Example + +The basic example demonstrates a simple blog scraping scenario: + +```python +from crawl4ai.scraper import AsyncWebScraper, BFSScraperStrategy, FilterChain + +# Create simple filter chain +filter_chain = FilterChain([ + URLPatternFilter("*/blog/*"), + ContentTypeFilter(["text/html"]) +]) + +# Initialize strategy +strategy = BFSScraperStrategy( + max_depth=2, + filter_chain=filter_chain, + url_scorer=None, + max_concurrent=3 +) + +# Create and run scraper +crawler = AsyncWebCrawler() +scraper = AsyncWebScraper(crawler, strategy) +result = await scraper.ascrape("https://example.com/blog/") +``` + +### Features Demonstrated +- Basic URL filtering +- Simple content type filtering +- Depth control +- Concurrent request limiting +- Result collection + +## Advanced Example + +The advanced example shows a sophisticated news site scraping setup with all features enabled: + +```python +# Create comprehensive filter chain +filter_chain = FilterChain([ + DomainFilter( + allowed_domains=["example.com"], + blocked_domains=["ads.example.com"] + ), + URLPatternFilter([ + "*/article/*", + re.compile(r"\d{4}/\d{2}/.*") + ]), + ContentTypeFilter(["text/html"]) 
+]) + +# Create intelligent scorer +scorer = CompositeScorer([ + KeywordRelevanceScorer( + keywords=["news", "breaking"], + weight=1.0 + ), + PathDepthScorer(optimal_depth=3, weight=0.7), + FreshnessScorer(weight=0.9) +]) + +# Initialize advanced strategy +strategy = BFSScraperStrategy( + max_depth=4, + filter_chain=filter_chain, + url_scorer=scorer, + max_concurrent=5 +) +``` + +### Features Demonstrated +1. **Advanced Filtering** + - Domain filtering + - Pattern matching + - Content type control + +2. **Intelligent Scoring** + - Keyword relevance + - Path optimization + - Freshness priority + +3. **Monitoring** + - Progress tracking + - Error handling + - Statistics collection + +4. **Resource Management** + - Concurrent processing + - Rate limiting + - Cleanup handling + +## Running the Examples + +```bash +# Basic usage +python basic_scraper_example.py + +# Advanced usage with logging +PYTHONPATH=. python advanced_scraper_example.py +``` + +## Example Output + +### Basic Example +``` +Crawled 15 pages: +- https://example.com/blog/post1: 24560 bytes +- https://example.com/blog/post2: 18920 bytes +... +``` + +### Advanced Example +``` +INFO: Starting crawl of https://example.com/news/ +INFO: Processed: https://example.com/news/breaking/story1 +DEBUG: KeywordScorer: 0.85 +DEBUG: FreshnessScorer: 0.95 +INFO: Progress: 10 URLs processed +... +INFO: Scraping completed: +INFO: - URLs processed: 50 +INFO: - Errors: 2 +INFO: - Total content size: 1240.50 KB +``` + +## Customization + +### Adding Custom Filters +```python +class CustomFilter(URLFilter): + def apply(self, url: str) -> bool: + # Your custom filtering logic + return True + +filter_chain.add_filter(CustomFilter()) +``` + +### Custom Scoring Logic +```python +class CustomScorer(URLScorer): + def _calculate_score(self, url: str) -> float: + # Your custom scoring logic + return 1.0 + +scorer = CompositeScorer([ + CustomScorer(weight=1.0), + ... +]) +``` + +## Best Practices + +1. 
**Start Simple** + - Begin with basic filtering + - Add features incrementally + - Test thoroughly at each step + +2. **Monitor Performance** + - Watch memory usage + - Track processing times + - Adjust concurrency as needed + +3. **Handle Errors** + - Implement proper error handling + - Log important events + - Track error statistics + +4. **Optimize Resources** + - Set appropriate delays + - Limit concurrent requests + - Use streaming for large crawls + +## Troubleshooting + +Common issues and solutions: + +1. **Too Many Requests** + ```python + strategy = BFSScraperStrategy( + max_concurrent=3, # Reduce concurrent requests + min_crawl_delay=2 # Increase delay between requests + ) + ``` + +2. **Memory Issues** + ```python + # Use streaming mode for large crawls + async for result in scraper.ascrape(url, stream=True): + process_result(result) + ``` + +3. **Missing Content** + ```python + # Check your filter chain + filter_chain = FilterChain([ + URLPatternFilter("*"), # Broaden patterns + ContentTypeFilter(["*"]) # Accept all content + ]) + ``` + +For more examples and use cases, visit our [GitHub repository](https://github.com/example/crawl4ai/examples). 
\ No newline at end of file diff --git a/docs/scrapper/web_crawler_quick_start.py b/docs/scrapper/web_crawler_quick_start.py new file mode 100644 index 00000000..99360f42 --- /dev/null +++ b/docs/scrapper/web_crawler_quick_start.py @@ -0,0 +1,111 @@ +import unittest, os +from crawl4ai.web_crawler import WebCrawler +from crawl4ai.chunking_strategy import RegexChunking, FixedLengthWordChunking, SlidingWindowChunking +from crawl4ai.extraction_strategy import CosineStrategy, LLMExtractionStrategy, TopicExtractionStrategy, NoExtractionStrategy + +class TestWebCrawler(unittest.TestCase): + + def setUp(self): + self.crawler = WebCrawler() + + def test_warmup(self): + self.crawler.warmup() + self.assertTrue(self.crawler.ready, "WebCrawler failed to warm up") + + def test_run_default_strategies(self): + result = self.crawler.run( + url='https://www.nbcnews.com/business', + word_count_threshold=5, + chunking_strategy=RegexChunking(), + extraction_strategy=CosineStrategy(), bypass_cache=True + ) + self.assertTrue(result.success, "Failed to crawl and extract using default strategies") + + def test_run_different_strategies(self): + url = 'https://www.nbcnews.com/business' + + # Test with FixedLengthWordChunking and LLMExtractionStrategy + result = self.crawler.run( + url=url, + word_count_threshold=5, + chunking_strategy=FixedLengthWordChunking(chunk_size=100), + extraction_strategy=LLMExtractionStrategy(provider="openai/gpt-3.5-turbo", api_token=os.getenv('OPENAI_API_KEY')), bypass_cache=True + ) + self.assertTrue(result.success, "Failed to crawl and extract with FixedLengthWordChunking and LLMExtractionStrategy") + + # Test with SlidingWindowChunking and TopicExtractionStrategy + result = self.crawler.run( + url=url, + word_count_threshold=5, + chunking_strategy=SlidingWindowChunking(window_size=100, step=50), + extraction_strategy=TopicExtractionStrategy(num_keywords=5), bypass_cache=True + ) + self.assertTrue(result.success, "Failed to crawl and extract with 
SlidingWindowChunking and TopicExtractionStrategy") + + def test_invalid_url(self): + with self.assertRaises(Exception) as context: + self.crawler.run(url='invalid_url', bypass_cache=True) + self.assertIn("Invalid URL", str(context.exception)) + + def test_unsupported_extraction_strategy(self): + with self.assertRaises(Exception) as context: + self.crawler.run(url='https://www.nbcnews.com/business', extraction_strategy="UnsupportedStrategy", bypass_cache=True) + self.assertIn("Unsupported extraction strategy", str(context.exception)) + + def test_invalid_css_selector(self): + with self.assertRaises(ValueError) as context: + self.crawler.run(url='https://www.nbcnews.com/business', css_selector="invalid_selector", bypass_cache=True) + self.assertIn("Invalid CSS selector", str(context.exception)) + + + def test_crawl_with_cache_and_bypass_cache(self): + url = 'https://www.nbcnews.com/business' + + # First crawl with cache enabled + result = self.crawler.run(url=url, bypass_cache=False) + self.assertTrue(result.success, "Failed to crawl and cache the result") + + # Second crawl with bypass_cache=True + result = self.crawler.run(url=url, bypass_cache=True) + self.assertTrue(result.success, "Failed to bypass cache and fetch fresh data") + + def test_fetch_multiple_pages(self): + urls = [ + 'https://www.nbcnews.com/business', + 'https://www.bbc.com/news' + ] + results = [] + for url in urls: + result = self.crawler.run( + url=url, + word_count_threshold=5, + chunking_strategy=RegexChunking(), + extraction_strategy=CosineStrategy(), + bypass_cache=True + ) + results.append(result) + + self.assertEqual(len(results), 2, "Failed to crawl and extract multiple pages") + for result in results: + self.assertTrue(result.success, "Failed to crawl and extract a page in the list") + + def test_run_fixed_length_word_chunking_and_no_extraction(self): + result = self.crawler.run( + url='https://www.nbcnews.com/business', + word_count_threshold=5, + 
chunking_strategy=FixedLengthWordChunking(chunk_size=100), + extraction_strategy=NoExtractionStrategy(), bypass_cache=True + ) + self.assertTrue(result.success, "Failed to crawl and extract with FixedLengthWordChunking and NoExtractionStrategy") + + def test_run_sliding_window_and_no_extraction(self): + result = self.crawler.run( + url='https://www.nbcnews.com/business', + word_count_threshold=5, + chunking_strategy=SlidingWindowChunking(window_size=100, step=50), + extraction_strategy=NoExtractionStrategy(), bypass_cache=True + ) + self.assertTrue(result.success, "Failed to crawl and extract with SlidingWindowChunking and NoExtractionStrategy") + +if __name__ == '__main__': + unittest.main() diff --git a/tests/test_scraper.py b/tests/test_scraper.py new file mode 100644 index 00000000..a2c7a239 --- /dev/null +++ b/tests/test_scraper.py @@ -0,0 +1,184 @@ +# basic_scraper_example.py +from crawl4ai.scraper import ( + AsyncWebScraper, + BFSScraperStrategy, + FilterChain, + URLPatternFilter, + ContentTypeFilter +) +from crawl4ai.async_webcrawler import AsyncWebCrawler + +async def basic_scraper_example(): + """ + Basic example: Scrape a blog site for articles + - Crawls only HTML pages + - Stays within the blog section + - Collects all results at once + """ + # Create a simple filter chain + filter_chain = FilterChain([ + # Only crawl pages within the blog section + URLPatternFilter("*/blog/*"), + # Only process HTML pages + ContentTypeFilter(["text/html"]) + ]) + + # Initialize the strategy with basic configuration + strategy = BFSScraperStrategy( + max_depth=2, # Only go 2 levels deep + filter_chain=filter_chain, + url_scorer=None, # Use default scoring + max_concurrent=3 # Limit concurrent requests + ) + + # Create the crawler and scraper + crawler = AsyncWebCrawler() + scraper = AsyncWebScraper(crawler, strategy) + + # Start scraping + try: + result = await scraper.ascrape("https://example.com/blog/") + + # Process results + print(f"Crawled 
{len(result.crawled_urls)} pages:") + for url, data in result.extracted_data.items(): + print(f"- {url}: {len(data.html)} bytes") + + except Exception as e: + print(f"Error during scraping: {e}") + +# advanced_scraper_example.py +import logging +from crawl4ai.scraper import ( + AsyncWebScraper, + BFSScraperStrategy, + FilterChain, + URLPatternFilter, + ContentTypeFilter, + DomainFilter, + KeywordRelevanceScorer, + PathDepthScorer, + FreshnessScorer, + CompositeScorer +) +from crawl4ai.async_webcrawler import AsyncWebCrawler + +async def advanced_scraper_example(): + """ + Advanced example: Intelligent news site scraping + - Uses all filter types + - Implements sophisticated scoring + - Streams results + - Includes monitoring and logging + """ + # Set up logging + logging.basicConfig(level=logging.INFO) + logger = logging.getLogger("advanced_scraper") + + # Create sophisticated filter chain + filter_chain = FilterChain([ + # Domain control + DomainFilter( + allowed_domains=["example.com", "blog.example.com"], + blocked_domains=["ads.example.com", "tracker.example.com"] + ), + # URL patterns + URLPatternFilter([ + "*/article/*", + "*/news/*", + "*/blog/*", + re.compile(r"\d{4}/\d{2}/.*") # Date-based URLs + ]), + # Content types + ContentTypeFilter([ + "text/html", + "application/xhtml+xml" + ]) + ]) + + # Create composite scorer + scorer = CompositeScorer([ + # Prioritize by keywords + KeywordRelevanceScorer( + keywords=["news", "breaking", "update", "latest"], + weight=1.0 + ), + # Prefer optimal URL structure + PathDepthScorer( + optimal_depth=3, + weight=0.7 + ), + # Prioritize fresh content + FreshnessScorer(weight=0.9) + ]) + + # Initialize strategy with advanced configuration + strategy = BFSScraperStrategy( + max_depth=4, + filter_chain=filter_chain, + url_scorer=scorer, + max_concurrent=5, + min_crawl_delay=1 + ) + + # Create crawler and scraper + crawler = AsyncWebCrawler() + scraper = AsyncWebScraper(crawler, strategy) + + # Track statistics + stats = { + 
'processed': 0, + 'errors': 0, + 'total_size': 0 + } + + try: + # Use streaming mode + async for result in scraper.ascrape("https://example.com/news/", stream=True): + stats['processed'] += 1 + + if result.success: + stats['total_size'] += len(result.html) + logger.info(f"Processed: {result.url}") + + # Print scoring information + for scorer_name, score in result.scores.items(): + logger.debug(f"{scorer_name}: {score:.2f}") + else: + stats['errors'] += 1 + logger.error(f"Failed to process {result.url}: {result.error_message}") + + # Log progress regularly + if stats['processed'] % 10 == 0: + logger.info(f"Progress: {stats['processed']} URLs processed") + + except Exception as e: + logger.error(f"Scraping error: {e}") + + finally: + # Print final statistics + logger.info("Scraping completed:") + logger.info(f"- URLs processed: {stats['processed']}") + logger.info(f"- Errors: {stats['errors']}") + logger.info(f"- Total content size: {stats['total_size'] / 1024:.2f} KB") + + # Print filter statistics + for filter_ in filter_chain.filters: + logger.info(f"{filter_.name} stats:") + logger.info(f"- Passed: {filter_.stats.passed_urls}") + logger.info(f"- Rejected: {filter_.stats.rejected_urls}") + + # Print scorer statistics + logger.info("Scoring statistics:") + logger.info(f"- Average score: {scorer.stats.average_score:.2f}") + logger.info(f"- Score range: {scorer.stats.min_score:.2f} - {scorer.stats.max_score:.2f}") + +if __name__ == "__main__": + import asyncio + + # Run basic example + print("Running basic scraper example...") + asyncio.run(basic_scraper_example()) + + print("\nRunning advanced scraper example...") + asyncio.run(advanced_scraper_example()) \ No newline at end of file From 0d357ab7d2659596442095f81f36330c9da7627c Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 8 Nov 2024 19:02:28 +0800 Subject: [PATCH 16/28] feat(scraper): Enhance URL filtering and scoring systems Implement comprehensive URL filtering and scoring capabilities: Filters: - Add 
URLPatternFilter with glob/regex support - Implement ContentTypeFilter with MIME type checking - Add DomainFilter for domain control - Create FilterChain with stats tracking Scorers: - Complete KeywordRelevanceScorer implementation - Add PathDepthScorer for URL structure scoring - Implement ContentTypeScorer for file type priorities - Add FreshnessScorer for date-based scoring - Add DomainAuthorityScorer for domain weighting - Create CompositeScorer for combined strategies Features: - Add statistics tracking for both filters and scorers - Implement logging support throughout - Add resource cleanup methods - Create comprehensive documentation - Include performance optimizations Tests and docs included. Note: Review URL normalization overlap with recent crawler changes. --- crawl4ai/scraper/__init__.py | 3 +- .../{filters/__init__.py => filters.py} | 0 .../scraper/filters/content_type_filter.py | 43 ---- crawl4ai/scraper/filters/url_filter.py | 72 ------- .../scraper/filters/url_pattern_filter.py | 39 ---- .../{scorers/__init__.py => scorers.py} | 0 .../scorers/keyword_relevance_scorer.py | 9 - crawl4ai/scraper/scorers/url_scorer.py | 6 - docs/scrapper/scraper_quickstart.py | 184 ++++++++++++++++++ docs/scrapper/web_crawler_quick_start.py | 111 ----------- 10 files changed, 186 insertions(+), 281 deletions(-) rename crawl4ai/scraper/{filters/__init__.py => filters.py} (100%) delete mode 100644 crawl4ai/scraper/filters/content_type_filter.py delete mode 100644 crawl4ai/scraper/filters/url_filter.py delete mode 100644 crawl4ai/scraper/filters/url_pattern_filter.py rename crawl4ai/scraper/{scorers/__init__.py => scorers.py} (100%) delete mode 100644 crawl4ai/scraper/scorers/keyword_relevance_scorer.py delete mode 100644 crawl4ai/scraper/scorers/url_scorer.py create mode 100644 docs/scrapper/scraper_quickstart.py delete mode 100644 docs/scrapper/web_crawler_quick_start.py diff --git a/crawl4ai/scraper/__init__.py b/crawl4ai/scraper/__init__.py index 1997e162..1138a917 
100644 --- a/crawl4ai/scraper/__init__.py +++ b/crawl4ai/scraper/__init__.py @@ -1,2 +1,3 @@ from .async_web_scraper import AsyncWebScraper -from .bfs_scraper_strategy import BFSScraperStrategy \ No newline at end of file +from .bfs_scraper_strategy import BFSScraperStrategy +from .filters import URLFilter, FilterChain, URLPatternFilter, ContentTypeFilter \ No newline at end of file diff --git a/crawl4ai/scraper/filters/__init__.py b/crawl4ai/scraper/filters.py similarity index 100% rename from crawl4ai/scraper/filters/__init__.py rename to crawl4ai/scraper/filters.py diff --git a/crawl4ai/scraper/filters/content_type_filter.py b/crawl4ai/scraper/filters/content_type_filter.py deleted file mode 100644 index 6966afdb..00000000 --- a/crawl4ai/scraper/filters/content_type_filter.py +++ /dev/null @@ -1,43 +0,0 @@ -from .url_filter import URLFilter -from typing import List, Union -from urllib.parse import urlparse -import mimetypes - - -class ContentTypeFilter(URLFilter): - """Filter URLs based on expected content type""" - - def __init__(self, allowed_types: Union[str, List[str]], - check_extension: bool = True): - super().__init__() - self.allowed_types = [allowed_types] if isinstance(allowed_types, str) else allowed_types - self.check_extension = check_extension - self._normalize_types() - - def _normalize_types(self): - """Normalize content type strings""" - self.allowed_types = [t.lower() for t in self.allowed_types] - - def _check_extension(self, url: str) -> bool: - """Check URL's file extension""" - ext = urlparse(url).path.split('.')[-1].lower() if '.' 
in urlparse(url).path else '' - if not ext: - return True # No extension, might be dynamic content - - guessed_type = mimetypes.guess_type(url)[0] - return any(allowed in (guessed_type or '').lower() for allowed in self.allowed_types) - - def apply(self, url: str) -> bool: - """Check if URL's content type is allowed""" - result = True - if self.check_extension: - result = self._check_extension(url) - self._update_stats(result) - return result - -# class ContentTypeFilter(URLFilter): -# def __init__(self, contentType: str): -# self.contentType = contentType -# def apply(self, url: str) -> bool: -# #TODO: This is a stub. Will implement this later -# return True \ No newline at end of file diff --git a/crawl4ai/scraper/filters/url_filter.py b/crawl4ai/scraper/filters/url_filter.py deleted file mode 100644 index 88a2c60a..00000000 --- a/crawl4ai/scraper/filters/url_filter.py +++ /dev/null @@ -1,72 +0,0 @@ -from abc import ABC, abstractmethod -from dataclasses import dataclass -import logging -from typing import List -@dataclass -class FilterStats: - """Statistics for filter applications""" - total_urls: int = 0 - rejected_urls: int = 0 - passed_urls: int = 0 - -class URLFilter(ABC): - """Base class for URL filters""" - - def __init__(self, name: str = None): - self.name = name or self.__class__.__name__ - self.stats = FilterStats() - self.logger = logging.getLogger(f"urlfilter.{self.name}") - - @abstractmethod - def apply(self, url: str) -> bool: - """Apply the filter to a URL""" - pass - - def _update_stats(self, passed: bool): - """Update filter statistics""" - self.stats.total_urls += 1 - if passed: - self.stats.passed_urls += 1 - else: - self.stats.rejected_urls += 1 - -class FilterChain: - """Chain of URL filters""" - - def __init__(self, filters: List[URLFilter] = None): - self.filters = filters or [] - self.stats = FilterStats() - self.logger = logging.getLogger("urlfilter.chain") - - def add_filter(self, filter_: URLFilter) -> 'FilterChain': - """Add a filter 
to the chain""" - self.filters.append(filter_) - return self # Enable method chaining - - def apply(self, url: str) -> bool: - """Apply all filters in the chain""" - self.stats.total_urls += 1 - - for filter_ in self.filters: - if not filter_.apply(url): - self.stats.rejected_urls += 1 - self.logger.debug(f"URL {url} rejected by {filter_.name}") - return False - - self.stats.passed_urls += 1 - return True - -# class URLFilter(ABC): -# @abstractmethod -# def apply(self, url: str) -> bool: -# pass - -# class FilterChain: -# def __init__(self): -# self.filters = [] - -# def add_filter(self, filter: URLFilter): -# self.filters.append(filter) - -# def apply(self, url: str) -> bool: -# return all(filter.apply(url) for filter in self.filters) \ No newline at end of file diff --git a/crawl4ai/scraper/filters/url_pattern_filter.py b/crawl4ai/scraper/filters/url_pattern_filter.py deleted file mode 100644 index 1e02b4a6..00000000 --- a/crawl4ai/scraper/filters/url_pattern_filter.py +++ /dev/null @@ -1,39 +0,0 @@ -from .url_filter import URLFilter -from re import Pattern -from typing import List, Union -import re -import fnmatch - - -class URLPatternFilter(URLFilter): - """Filter URLs based on glob patterns or regex""" - - def __init__(self, patterns: Union[str, Pattern, List[Union[str, Pattern]]], - use_glob: bool = True): - super().__init__() - self.patterns = [patterns] if isinstance(patterns, (str, Pattern)) else patterns - self.use_glob = use_glob - self._compiled_patterns = [] - - for pattern in self.patterns: - if isinstance(pattern, str) and use_glob: - self._compiled_patterns.append(self._glob_to_regex(pattern)) - else: - self._compiled_patterns.append(re.compile(pattern) if isinstance(pattern, str) else pattern) - - def _glob_to_regex(self, pattern: str) -> Pattern: - """Convert glob pattern to regex""" - return re.compile(fnmatch.translate(pattern)) - - def apply(self, url: str) -> bool: - """Check if URL matches any of the patterns""" - matches = 
any(pattern.search(url) for pattern in self._compiled_patterns) - self._update_stats(matches) - return matches - -# class URLPatternFilter(URLFilter): -# def __init__(self, pattern: Pattern): -# self.pattern = pattern -# def apply(self, url: str) -> bool: -# #TODO: This is a stub. Will implement this later. -# return True \ No newline at end of file diff --git a/crawl4ai/scraper/scorers/__init__.py b/crawl4ai/scraper/scorers.py similarity index 100% rename from crawl4ai/scraper/scorers/__init__.py rename to crawl4ai/scraper/scorers.py diff --git a/crawl4ai/scraper/scorers/keyword_relevance_scorer.py b/crawl4ai/scraper/scorers/keyword_relevance_scorer.py deleted file mode 100644 index a2338aec..00000000 --- a/crawl4ai/scraper/scorers/keyword_relevance_scorer.py +++ /dev/null @@ -1,9 +0,0 @@ -from .url_scorer import URLScorer -from typing import List - -class KeywordRelevanceScorer(URLScorer): - def __init__(self,keywords: List[str]): - self.keyworkds = keywords - def score(self, url: str) -> float: - #TODO: This is a stub. Will implement this later. 
- return 1 \ No newline at end of file diff --git a/crawl4ai/scraper/scorers/url_scorer.py b/crawl4ai/scraper/scorers/url_scorer.py deleted file mode 100644 index 6ee9ab05..00000000 --- a/crawl4ai/scraper/scorers/url_scorer.py +++ /dev/null @@ -1,6 +0,0 @@ -from abc import ABC, abstractmethod - -class URLScorer(ABC): - @abstractmethod - def score(self, url: str) -> float: - pass \ No newline at end of file diff --git a/docs/scrapper/scraper_quickstart.py b/docs/scrapper/scraper_quickstart.py new file mode 100644 index 00000000..a2c7a239 --- /dev/null +++ b/docs/scrapper/scraper_quickstart.py @@ -0,0 +1,184 @@ +# basic_scraper_example.py +from crawl4ai.scraper import ( + AsyncWebScraper, + BFSScraperStrategy, + FilterChain, + URLPatternFilter, + ContentTypeFilter +) +from crawl4ai.async_webcrawler import AsyncWebCrawler + +async def basic_scraper_example(): + """ + Basic example: Scrape a blog site for articles + - Crawls only HTML pages + - Stays within the blog section + - Collects all results at once + """ + # Create a simple filter chain + filter_chain = FilterChain([ + # Only crawl pages within the blog section + URLPatternFilter("*/blog/*"), + # Only process HTML pages + ContentTypeFilter(["text/html"]) + ]) + + # Initialize the strategy with basic configuration + strategy = BFSScraperStrategy( + max_depth=2, # Only go 2 levels deep + filter_chain=filter_chain, + url_scorer=None, # Use default scoring + max_concurrent=3 # Limit concurrent requests + ) + + # Create the crawler and scraper + crawler = AsyncWebCrawler() + scraper = AsyncWebScraper(crawler, strategy) + + # Start scraping + try: + result = await scraper.ascrape("https://example.com/blog/") + + # Process results + print(f"Crawled {len(result.crawled_urls)} pages:") + for url, data in result.extracted_data.items(): + print(f"- {url}: {len(data.html)} bytes") + + except Exception as e: + print(f"Error during scraping: {e}") + +# advanced_scraper_example.py +import logging +from crawl4ai.scraper import 
( + AsyncWebScraper, + BFSScraperStrategy, + FilterChain, + URLPatternFilter, + ContentTypeFilter, + DomainFilter, + KeywordRelevanceScorer, + PathDepthScorer, + FreshnessScorer, + CompositeScorer +) +from crawl4ai.async_webcrawler import AsyncWebCrawler + +async def advanced_scraper_example(): + """ + Advanced example: Intelligent news site scraping + - Uses all filter types + - Implements sophisticated scoring + - Streams results + - Includes monitoring and logging + """ + # Set up logging + logging.basicConfig(level=logging.INFO) + logger = logging.getLogger("advanced_scraper") + + # Create sophisticated filter chain + filter_chain = FilterChain([ + # Domain control + DomainFilter( + allowed_domains=["example.com", "blog.example.com"], + blocked_domains=["ads.example.com", "tracker.example.com"] + ), + # URL patterns + URLPatternFilter([ + "*/article/*", + "*/news/*", + "*/blog/*", + re.compile(r"\d{4}/\d{2}/.*") # Date-based URLs + ]), + # Content types + ContentTypeFilter([ + "text/html", + "application/xhtml+xml" + ]) + ]) + + # Create composite scorer + scorer = CompositeScorer([ + # Prioritize by keywords + KeywordRelevanceScorer( + keywords=["news", "breaking", "update", "latest"], + weight=1.0 + ), + # Prefer optimal URL structure + PathDepthScorer( + optimal_depth=3, + weight=0.7 + ), + # Prioritize fresh content + FreshnessScorer(weight=0.9) + ]) + + # Initialize strategy with advanced configuration + strategy = BFSScraperStrategy( + max_depth=4, + filter_chain=filter_chain, + url_scorer=scorer, + max_concurrent=5, + min_crawl_delay=1 + ) + + # Create crawler and scraper + crawler = AsyncWebCrawler() + scraper = AsyncWebScraper(crawler, strategy) + + # Track statistics + stats = { + 'processed': 0, + 'errors': 0, + 'total_size': 0 + } + + try: + # Use streaming mode + async for result in scraper.ascrape("https://example.com/news/", stream=True): + stats['processed'] += 1 + + if result.success: + stats['total_size'] += len(result.html) + 
logger.info(f"Processed: {result.url}") + + # Print scoring information + for scorer_name, score in result.scores.items(): + logger.debug(f"{scorer_name}: {score:.2f}") + else: + stats['errors'] += 1 + logger.error(f"Failed to process {result.url}: {result.error_message}") + + # Log progress regularly + if stats['processed'] % 10 == 0: + logger.info(f"Progress: {stats['processed']} URLs processed") + + except Exception as e: + logger.error(f"Scraping error: {e}") + + finally: + # Print final statistics + logger.info("Scraping completed:") + logger.info(f"- URLs processed: {stats['processed']}") + logger.info(f"- Errors: {stats['errors']}") + logger.info(f"- Total content size: {stats['total_size'] / 1024:.2f} KB") + + # Print filter statistics + for filter_ in filter_chain.filters: + logger.info(f"{filter_.name} stats:") + logger.info(f"- Passed: {filter_.stats.passed_urls}") + logger.info(f"- Rejected: {filter_.stats.rejected_urls}") + + # Print scorer statistics + logger.info("Scoring statistics:") + logger.info(f"- Average score: {scorer.stats.average_score:.2f}") + logger.info(f"- Score range: {scorer.stats.min_score:.2f} - {scorer.stats.max_score:.2f}") + +if __name__ == "__main__": + import asyncio + + # Run basic example + print("Running basic scraper example...") + asyncio.run(basic_scraper_example()) + + print("\nRunning advanced scraper example...") + asyncio.run(advanced_scraper_example()) \ No newline at end of file diff --git a/docs/scrapper/web_crawler_quick_start.py b/docs/scrapper/web_crawler_quick_start.py deleted file mode 100644 index 99360f42..00000000 --- a/docs/scrapper/web_crawler_quick_start.py +++ /dev/null @@ -1,111 +0,0 @@ -import unittest, os -from crawl4ai.web_crawler import WebCrawler -from crawl4ai.chunking_strategy import RegexChunking, FixedLengthWordChunking, SlidingWindowChunking -from crawl4ai.extraction_strategy import CosineStrategy, LLMExtractionStrategy, TopicExtractionStrategy, NoExtractionStrategy - -class 
TestWebCrawler(unittest.TestCase): - - def setUp(self): - self.crawler = WebCrawler() - - def test_warmup(self): - self.crawler.warmup() - self.assertTrue(self.crawler.ready, "WebCrawler failed to warm up") - - def test_run_default_strategies(self): - result = self.crawler.run( - url='https://www.nbcnews.com/business', - word_count_threshold=5, - chunking_strategy=RegexChunking(), - extraction_strategy=CosineStrategy(), bypass_cache=True - ) - self.assertTrue(result.success, "Failed to crawl and extract using default strategies") - - def test_run_different_strategies(self): - url = 'https://www.nbcnews.com/business' - - # Test with FixedLengthWordChunking and LLMExtractionStrategy - result = self.crawler.run( - url=url, - word_count_threshold=5, - chunking_strategy=FixedLengthWordChunking(chunk_size=100), - extraction_strategy=LLMExtractionStrategy(provider="openai/gpt-3.5-turbo", api_token=os.getenv('OPENAI_API_KEY')), bypass_cache=True - ) - self.assertTrue(result.success, "Failed to crawl and extract with FixedLengthWordChunking and LLMExtractionStrategy") - - # Test with SlidingWindowChunking and TopicExtractionStrategy - result = self.crawler.run( - url=url, - word_count_threshold=5, - chunking_strategy=SlidingWindowChunking(window_size=100, step=50), - extraction_strategy=TopicExtractionStrategy(num_keywords=5), bypass_cache=True - ) - self.assertTrue(result.success, "Failed to crawl and extract with SlidingWindowChunking and TopicExtractionStrategy") - - def test_invalid_url(self): - with self.assertRaises(Exception) as context: - self.crawler.run(url='invalid_url', bypass_cache=True) - self.assertIn("Invalid URL", str(context.exception)) - - def test_unsupported_extraction_strategy(self): - with self.assertRaises(Exception) as context: - self.crawler.run(url='https://www.nbcnews.com/business', extraction_strategy="UnsupportedStrategy", bypass_cache=True) - self.assertIn("Unsupported extraction strategy", str(context.exception)) - - def 
test_invalid_css_selector(self): - with self.assertRaises(ValueError) as context: - self.crawler.run(url='https://www.nbcnews.com/business', css_selector="invalid_selector", bypass_cache=True) - self.assertIn("Invalid CSS selector", str(context.exception)) - - - def test_crawl_with_cache_and_bypass_cache(self): - url = 'https://www.nbcnews.com/business' - - # First crawl with cache enabled - result = self.crawler.run(url=url, bypass_cache=False) - self.assertTrue(result.success, "Failed to crawl and cache the result") - - # Second crawl with bypass_cache=True - result = self.crawler.run(url=url, bypass_cache=True) - self.assertTrue(result.success, "Failed to bypass cache and fetch fresh data") - - def test_fetch_multiple_pages(self): - urls = [ - 'https://www.nbcnews.com/business', - 'https://www.bbc.com/news' - ] - results = [] - for url in urls: - result = self.crawler.run( - url=url, - word_count_threshold=5, - chunking_strategy=RegexChunking(), - extraction_strategy=CosineStrategy(), - bypass_cache=True - ) - results.append(result) - - self.assertEqual(len(results), 2, "Failed to crawl and extract multiple pages") - for result in results: - self.assertTrue(result.success, "Failed to crawl and extract a page in the list") - - def test_run_fixed_length_word_chunking_and_no_extraction(self): - result = self.crawler.run( - url='https://www.nbcnews.com/business', - word_count_threshold=5, - chunking_strategy=FixedLengthWordChunking(chunk_size=100), - extraction_strategy=NoExtractionStrategy(), bypass_cache=True - ) - self.assertTrue(result.success, "Failed to crawl and extract with FixedLengthWordChunking and NoExtractionStrategy") - - def test_run_sliding_window_and_no_extraction(self): - result = self.crawler.run( - url='https://www.nbcnews.com/business', - word_count_threshold=5, - chunking_strategy=SlidingWindowChunking(window_size=100, step=50), - extraction_strategy=NoExtractionStrategy(), bypass_cache=True - ) - self.assertTrue(result.success, "Failed to 
crawl and extract with SlidingWindowChunking and NoExtractionStrategy") - -if __name__ == '__main__': - unittest.main() From c1797037c02a3d26cd8e71fc3ba088c3a919c6cd Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Sat, 23 Nov 2024 12:39:25 +0530 Subject: [PATCH 17/28] Fixed a few bugs, import errors and changed to asyncio wait_for instead of timeout to support python versions < 3.11 --- crawl4ai/scraper/__init__.py | 4 ++- crawl4ai/scraper/bfs_scraper_strategy.py | 41 ++++++------------------ crawl4ai/scraper/scraper_strategy.py | 18 ++++++++++- 3 files changed, 30 insertions(+), 33 deletions(-) diff --git a/crawl4ai/scraper/__init__.py b/crawl4ai/scraper/__init__.py index 1138a917..5af7ad6b 100644 --- a/crawl4ai/scraper/__init__.py +++ b/crawl4ai/scraper/__init__.py @@ -1,3 +1,5 @@ from .async_web_scraper import AsyncWebScraper from .bfs_scraper_strategy import BFSScraperStrategy -from .filters import URLFilter, FilterChain, URLPatternFilter, ContentTypeFilter \ No newline at end of file +from .filters import URLFilter, FilterChain, URLPatternFilter, ContentTypeFilter, DomainFilter +from .scorers import KeywordRelevanceScorer, PathDepthScorer, FreshnessScorer, CompositeScorer +from .scraper_strategy import ScraperStrategy \ No newline at end of file diff --git a/crawl4ai/scraper/bfs_scraper_strategy.py b/crawl4ai/scraper/bfs_scraper_strategy.py index 72935008..cae7ba90 100644 --- a/crawl4ai/scraper/bfs_scraper_strategy.py +++ b/crawl4ai/scraper/bfs_scraper_strategy.py @@ -16,6 +16,7 @@ from .models import ScraperResult, CrawlResult from .filters import FilterChain from .scorers import URLScorer from ..async_webcrawler import AsyncWebCrawler +from .scraper_strategy import ScraperStrategy @dataclass class CrawlStats: @@ -28,30 +29,6 @@ class CrawlStats: current_depth: int = 0 robots_blocked: int = 0 -class ScraperStrategy(ABC): - """Base class for scraping strategies""" - - @abstractmethod - async def ascrape( - self, - url: str, - crawler: AsyncWebCrawler, - 
parallel_processing: bool = True, - stream: bool = False - ) -> Union[AsyncGenerator[CrawlResult, None], ScraperResult]: - """Abstract method for scraping implementation""" - pass - - @abstractmethod - async def can_process_url(self, url: str) -> bool: - """Check if URL can be processed based on strategy rules""" - pass - - @abstractmethod - async def shutdown(self): - """Clean up resources used by the strategy""" - pass - class BFSScraperStrategy(ScraperStrategy): """Breadth-First Search scraping strategy with politeness controls""" @@ -135,11 +112,15 @@ class BFSScraperStrategy(ScraperStrategy): ) -> CrawlResult: """Crawl URL with retry logic""" try: - async with asyncio.timeout(self.timeout): - return await crawler.arun(url) + return await asyncio.wait_for(crawler.arun(url), timeout=self.timeout) except asyncio.TimeoutError: self.logger.error(f"Timeout crawling {url}") raise + except Exception as e: + # Catch any other exceptions that may cause retries + self.logger.error(f"Error crawling {url}: {e}") + raise + async def process_url( self, @@ -181,16 +162,14 @@ class BFSScraperStrategy(ScraperStrategy): async with self.rate_limiter: result = await self._crawl_with_retry(crawler, url) self.stats.urls_processed += 1 + # Process links + await self._process_links(result, url, depth, queue, visited, depths) + return result except Exception as e: self.logger.error(f"Error crawling {url}: {e}") self.stats.urls_failed += 1 return None - # Process links - await self._process_links(result, url, depth, queue, visited, depths) - - return result - async def _process_links( self, result: CrawlResult, diff --git a/crawl4ai/scraper/scraper_strategy.py b/crawl4ai/scraper/scraper_strategy.py index e4872de7..f1588f0c 100644 --- a/crawl4ai/scraper/scraper_strategy.py +++ b/crawl4ai/scraper/scraper_strategy.py @@ -6,7 +6,13 @@ from typing import Union, AsyncGenerator class ScraperStrategy(ABC): @abstractmethod - async def ascrape(self, url: str, crawler: AsyncWebCrawler, 
parallel_processing: bool = True, stream: bool = False) -> Union[AsyncGenerator[CrawlResult, None], ScraperResult]: + async def ascrape( + self, + url: str, + crawler: AsyncWebCrawler, + parallel_processing: bool = True, + stream: bool = False + ) -> Union[AsyncGenerator[CrawlResult, None], ScraperResult]: """Scrape the given URL using the specified crawler. Args: @@ -23,4 +29,14 @@ class ScraperStrategy(ABC): ScraperResult: A summary of the scrape results containing the final extracted data and the list of crawled URLs if stream is False. """ + pass + + @abstractmethod + async def can_process_url(self, url: str) -> bool: + """Check if URL can be processed based on strategy rules""" + pass + + @abstractmethod + async def shutdown(self): + """Clean up resources used by the strategy""" pass \ No newline at end of file From f8e85b149939eea296805a680c080bbac884f269 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Sat, 23 Nov 2024 13:52:34 +0530 Subject: [PATCH 18/28] Fixed a bug in _process_links, handled condition for when url_scorer is passed as None, renamed the scrapper folder to scraper. 
--- crawl4ai/scraper/bfs_scraper_strategy.py | 34 +++++++++++-------- .../async_web_scraper.md | 0 .../bfs_scraper_strategy.md | 0 .../{scrapper => scraper}/filters_scrorers.md | 0 docs/{scrapper => scraper}/how_to_use.md | 0 .../scraper_quickstart.py | 32 ++++++++--------- 6 files changed, 35 insertions(+), 31 deletions(-) rename docs/{scrapper => scraper}/async_web_scraper.md (100%) rename docs/{scrapper => scraper}/bfs_scraper_strategy.md (100%) rename docs/{scrapper => scraper}/filters_scrorers.md (100%) rename docs/{scrapper => scraper}/how_to_use.md (100%) rename docs/{scrapper => scraper}/scraper_quickstart.py (88%) diff --git a/crawl4ai/scraper/bfs_scraper_strategy.py b/crawl4ai/scraper/bfs_scraper_strategy.py index cae7ba90..72a86203 100644 --- a/crawl4ai/scraper/bfs_scraper_strategy.py +++ b/crawl4ai/scraper/bfs_scraper_strategy.py @@ -192,22 +192,26 @@ class BFSScraperStrategy(ScraperStrategy): links_ro_process = result.links["internal"] if self.process_external_links: links_ro_process += result.links["external"] - for link_type in links_ro_process: - for link in result.links[link_type]: - url = link['href'] - # url = urljoin(source_url, link['href']) - # url = urlunparse(urlparse(url)._replace(fragment="")) - - if url not in visited and await self.can_process_url(url): - new_depth = depths[source_url] + 1 - if new_depth <= self.max_depth: + for link in links_ro_process: + url = link['href'] + # url = urljoin(source_url, link['href']) + # url = urlunparse(urlparse(url)._replace(fragment="")) + + if url not in visited and await self.can_process_url(url): + new_depth = depths[source_url] + 1 + if new_depth <= self.max_depth: + if self.url_scorer: score = self.url_scorer.score(url) - await queue.put((score, new_depth, url)) - depths[url] = new_depth - self.stats.total_depth_reached = max( - self.stats.total_depth_reached, - new_depth - ) + else: + # When no url_scorer is provided all urls will have same score of 0. 
+ # Therefore will be processed in FIFO order as per URL depth + score = 0 + await queue.put((score, new_depth, url)) + depths[url] = new_depth + self.stats.total_depth_reached = max( + self.stats.total_depth_reached, + new_depth + ) async def ascrape( self, diff --git a/docs/scrapper/async_web_scraper.md b/docs/scraper/async_web_scraper.md similarity index 100% rename from docs/scrapper/async_web_scraper.md rename to docs/scraper/async_web_scraper.md diff --git a/docs/scrapper/bfs_scraper_strategy.md b/docs/scraper/bfs_scraper_strategy.md similarity index 100% rename from docs/scrapper/bfs_scraper_strategy.md rename to docs/scraper/bfs_scraper_strategy.md diff --git a/docs/scrapper/filters_scrorers.md b/docs/scraper/filters_scrorers.md similarity index 100% rename from docs/scrapper/filters_scrorers.md rename to docs/scraper/filters_scrorers.md diff --git a/docs/scrapper/how_to_use.md b/docs/scraper/how_to_use.md similarity index 100% rename from docs/scrapper/how_to_use.md rename to docs/scraper/how_to_use.md diff --git a/docs/scrapper/scraper_quickstart.py b/docs/scraper/scraper_quickstart.py similarity index 88% rename from docs/scrapper/scraper_quickstart.py rename to docs/scraper/scraper_quickstart.py index a2c7a239..811f997e 100644 --- a/docs/scrapper/scraper_quickstart.py +++ b/docs/scraper/scraper_quickstart.py @@ -7,6 +7,7 @@ from crawl4ai.scraper import ( ContentTypeFilter ) from crawl4ai.async_webcrawler import AsyncWebCrawler +import re async def basic_scraper_example(): """ @@ -18,7 +19,7 @@ async def basic_scraper_example(): # Create a simple filter chain filter_chain = FilterChain([ # Only crawl pages within the blog section - URLPatternFilter("*/blog/*"), + # URLPatternFilter("*/tutorial/*"), # Only process HTML pages ContentTypeFilter(["text/html"]) ]) @@ -32,20 +33,19 @@ async def basic_scraper_example(): ) # Create the crawler and scraper - crawler = AsyncWebCrawler() - scraper = AsyncWebScraper(crawler, strategy) - - # Start scraping - try: - 
result = await scraper.ascrape("https://example.com/blog/") - - # Process results - print(f"Crawled {len(result.crawled_urls)} pages:") - for url, data in result.extracted_data.items(): - print(f"- {url}: {len(data.html)} bytes") + async with AsyncWebCrawler(verbose=True) as crawler: + scraper = AsyncWebScraper(crawler, strategy) + # Start scraping + try: + result = await scraper.ascrape("https://crawl4ai.com/mkdocs") - except Exception as e: - print(f"Error during scraping: {e}") + # Process results + print(f"Crawled {len(result.crawled_urls)} pages:") + for url, data in result.extracted_data.items(): + print(f"- {url}: {len(data.html)} bytes") + + except Exception as e: + print(f"Error during scraping: {e}") # advanced_scraper_example.py import logging @@ -180,5 +180,5 @@ if __name__ == "__main__": print("Running basic scraper example...") asyncio.run(basic_scraper_example()) - print("\nRunning advanced scraper example...") - asyncio.run(advanced_scraper_example()) \ No newline at end of file + # print("\nRunning advanced scraper example...") + # asyncio.run(advanced_scraper_example()) \ No newline at end of file From 2226ef53c84fe116a5cc6c15f1cd0fab178715f9 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Sat, 23 Nov 2024 14:59:14 +0530 Subject: [PATCH 19/28] fix: Exempting the start_url from can_process_url --- crawl4ai/scraper/bfs_scraper_strategy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawl4ai/scraper/bfs_scraper_strategy.py b/crawl4ai/scraper/bfs_scraper_strategy.py index 72a86203..b87412ef 100644 --- a/crawl4ai/scraper/bfs_scraper_strategy.py +++ b/crawl4ai/scraper/bfs_scraper_strategy.py @@ -146,7 +146,7 @@ class BFSScraperStrategy(ScraperStrategy): if self._cancel_event.is_set(): return None - if not await self.can_process_url(url): + if depth!=0 and not await self.can_process_url(url): self.stats.urls_skipped += 1 return None From b13fd71040184851c367bd287d07772459eeb07a Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: 
Tue, 26 Nov 2024 10:07:11 +0530 Subject: [PATCH 20/28] chore: 1. Expose process_external_links as a param 2. Removed a few unused imports 3. Removed URL normalisation for external links separately as that won't be necessary --- crawl4ai/scraper/bfs_scraper_strategy.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/crawl4ai/scraper/bfs_scraper_strategy.py b/crawl4ai/scraper/bfs_scraper_strategy.py index b87412ef..c12bf42e 100644 --- a/crawl4ai/scraper/bfs_scraper_strategy.py +++ b/crawl4ai/scraper/bfs_scraper_strategy.py @@ -1,10 +1,9 @@ -from abc import ABC, abstractmethod -from typing import Union, AsyncGenerator, Optional, Dict, Set +from typing import AsyncGenerator, Optional, Dict, Set from dataclasses import dataclass from datetime import datetime import asyncio import logging -from urllib.parse import urljoin, urlparse, urlunparse +from urllib.parse import urlparse from urllib.robotparser import RobotFileParser import validators import time @@ -12,7 +11,7 @@ from aiolimiter import AsyncLimiter from tenacity import retry, stop_after_attempt, wait_exponential from collections import defaultdict -from .models import ScraperResult, CrawlResult +from .models import CrawlResult from .filters import FilterChain from .scorers import URLScorer from ..async_webcrawler import AsyncWebCrawler @@ -37,6 +36,7 @@ class BFSScraperStrategy(ScraperStrategy): max_depth: int, filter_chain: FilterChain, url_scorer: URLScorer, + process_external_links: bool = False, max_concurrent: int = 5, min_crawl_delay: int = 1, timeout: int = 30, @@ -53,7 +53,7 @@ class BFSScraperStrategy(ScraperStrategy): # Crawl control self.stats = CrawlStats(start_time=datetime.now()) self._cancel_event = asyncio.Event() - self.process_external_links = False + self.process_external_links = process_external_links # Rate limiting and politeness self.rate_limiter = AsyncLimiter(1, 1) @@ -189,14 +189,11 @@ class BFSScraperStrategy(ScraperStrategy): Adds valid URLs to 
the queue Updates maximum depth statistics """ - links_ro_process = result.links["internal"] + links_to_process = result.links["internal"] if self.process_external_links: - links_ro_process += result.links["external"] - for link in links_ro_process: + links_to_process += result.links["external"] + for link in links_to_process: url = link['href'] - # url = urljoin(source_url, link['href']) - # url = urlunparse(urlparse(url)._replace(fragment="")) - if url not in visited and await self.can_process_url(url): new_depth = depths[source_url] + 1 if new_depth <= self.max_depth: From ee3001b1f7c0c8200820e0f3e2334967ea7744da Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Tue, 26 Nov 2024 10:22:14 +0530 Subject: [PATCH 21/28] fix: moved depth as a param to can_process_url and applying filter chain only when depth is not zero. This way filter chain is skipped but other validations are in place even for start URL --- crawl4ai/scraper/bfs_scraper_strategy.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/crawl4ai/scraper/bfs_scraper_strategy.py b/crawl4ai/scraper/bfs_scraper_strategy.py index c12bf42e..3109e96d 100644 --- a/crawl4ai/scraper/bfs_scraper_strategy.py +++ b/crawl4ai/scraper/bfs_scraper_strategy.py @@ -61,7 +61,7 @@ class BFSScraperStrategy(ScraperStrategy): self.robot_parsers: Dict[str, RobotFileParser] = {} self.domain_queues: Dict[str, asyncio.Queue] = defaultdict(asyncio.Queue) - async def can_process_url(self, url: str) -> bool: + async def can_process_url(self, url: str, depth: int) -> bool: """Check if URL can be processed based on robots.txt and filters This is our gatekeeper method that determines if a URL should be processed. 
It: - Validates URL format using the validators library @@ -80,7 +80,11 @@ class BFSScraperStrategy(ScraperStrategy): self.logger.info(f"Blocked by robots.txt: {url}") return False - return self.filter_chain.apply(url) + # Apply the filter chain it's not start page + if depth != 0 and not self.filter_chain.apply(url): + return False + + return True async def _get_robot_parser(self, url: str) -> Optional[RobotFileParser]: """Get or create robots.txt parser for domain. @@ -146,7 +150,7 @@ class BFSScraperStrategy(ScraperStrategy): if self._cancel_event.is_set(): return None - if depth!=0 and not await self.can_process_url(url): + if not await self.can_process_url(url, depth): self.stats.urls_skipped += 1 return None @@ -194,7 +198,7 @@ class BFSScraperStrategy(ScraperStrategy): links_to_process += result.links["external"] for link in links_to_process: url = link['href'] - if url not in visited and await self.can_process_url(url): + if url not in visited and await self.can_process_url(url,depth): new_depth = depths[source_url] + 1 if new_depth <= self.max_depth: if self.url_scorer: From a98d51a62cd9f1ee27bf9459c88f73b1b7956d37 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Tue, 26 Nov 2024 11:11:49 +0530 Subject: [PATCH 22/28] Remove the can_process_url check from _process_links since it's already being checked in process_url --- crawl4ai/scraper/bfs_scraper_strategy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawl4ai/scraper/bfs_scraper_strategy.py b/crawl4ai/scraper/bfs_scraper_strategy.py index 3109e96d..3a6d09a5 100644 --- a/crawl4ai/scraper/bfs_scraper_strategy.py +++ b/crawl4ai/scraper/bfs_scraper_strategy.py @@ -198,7 +198,7 @@ class BFSScraperStrategy(ScraperStrategy): links_to_process += result.links["external"] for link in links_to_process: url = link['href'] - if url not in visited and await self.can_process_url(url,depth): + if url not in visited: new_depth = depths[source_url] + 1 if new_depth <= self.max_depth: if 
self.url_scorer: From a888c91790e8234179f8253f6bbb66dcf401bbca Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Tue, 26 Nov 2024 14:05:02 +0530 Subject: [PATCH 23/28] Fix "Future attached to a different loop" error by ensuring tasks are created in the correct event loop - Explicitly retrieve and use the correct event loop when creating tasks to avoid cross-loop issues. - Ensures proper task scheduling in environments with multiple event loops. --- crawl4ai/async_crawler_strategy.py | 3 ++- crawl4ai/scraper/bfs_scraper_strategy.py | 3 ++- main.py | 6 ++++-- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 3f332eb0..df23c43e 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -68,7 +68,8 @@ class ManagedBrowser: stderr=subprocess.PIPE ) # Monitor browser process output for errors - asyncio.create_task(self._monitor_browser_process()) + loop = asyncio.get_event_loop() + loop.create_task(self._monitor_browser_process()) await asyncio.sleep(2) # Give browser time to start return f"http://localhost:{self.debugging_port}" except Exception as e: diff --git a/crawl4ai/scraper/bfs_scraper_strategy.py b/crawl4ai/scraper/bfs_scraper_strategy.py index 3a6d09a5..73a4f8ae 100644 --- a/crawl4ai/scraper/bfs_scraper_strategy.py +++ b/crawl4ai/scraper/bfs_scraper_strategy.py @@ -264,7 +264,8 @@ class BFSScraperStrategy(ScraperStrategy): self.stats.current_depth = depth if parallel_processing: - task = asyncio.create_task( + loop = asyncio.get_event_loop() + task = loop.create_task( self.process_url(url, depth, crawler, queue, visited, depths) ) pending_tasks.add(task) diff --git a/main.py b/main.py index 6d217410..bc5dfe7e 100644 --- a/main.py +++ b/main.py @@ -125,7 +125,8 @@ class TaskManager: self.cleanup_task = None async def start(self): - self.cleanup_task = asyncio.create_task(self._cleanup_loop()) + loop = asyncio.get_event_loop() + 
self.cleanup_task = loop.create_task(self._cleanup_loop()) async def stop(self): if self.cleanup_task: @@ -231,7 +232,8 @@ class CrawlerService: async def start(self): await self.task_manager.start() - self._processing_task = asyncio.create_task(self._process_queue()) + loop = asyncio.get_event_loop() + self._processing_task = loop.create_task(self._process_queue()) async def stop(self): if self._processing_task: From 155c756238ee8efac6808b1c14c08a5638464aa0 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Tue, 26 Nov 2024 17:04:04 +0530 Subject: [PATCH 24/28] issue fix was incorrect. Reverting --- crawl4ai/async_crawler_strategy.py | 3 +-- crawl4ai/scraper/bfs_scraper_strategy.py | 3 +-- main.py | 6 ++---- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index df23c43e..3f332eb0 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -68,8 +68,7 @@ class ManagedBrowser: stderr=subprocess.PIPE ) # Monitor browser process output for errors - loop = asyncio.get_event_loop() - loop.create_task(self._monitor_browser_process()) + asyncio.create_task(self._monitor_browser_process()) await asyncio.sleep(2) # Give browser time to start return f"http://localhost:{self.debugging_port}" except Exception as e: diff --git a/crawl4ai/scraper/bfs_scraper_strategy.py b/crawl4ai/scraper/bfs_scraper_strategy.py index 73a4f8ae..3a6d09a5 100644 --- a/crawl4ai/scraper/bfs_scraper_strategy.py +++ b/crawl4ai/scraper/bfs_scraper_strategy.py @@ -264,8 +264,7 @@ class BFSScraperStrategy(ScraperStrategy): self.stats.current_depth = depth if parallel_processing: - loop = asyncio.get_event_loop() - task = loop.create_task( + task = asyncio.create_task( self.process_url(url, depth, crawler, queue, visited, depths) ) pending_tasks.add(task) diff --git a/main.py b/main.py index bc5dfe7e..6d217410 100644 --- a/main.py +++ b/main.py @@ -125,8 +125,7 @@ class TaskManager: 
self.cleanup_task = None async def start(self): - loop = asyncio.get_event_loop() - self.cleanup_task = loop.create_task(self._cleanup_loop()) + self.cleanup_task = asyncio.create_task(self._cleanup_loop()) async def stop(self): if self.cleanup_task: @@ -232,8 +231,7 @@ class CrawlerService: async def start(self): await self.task_manager.start() - loop = asyncio.get_event_loop() - self._processing_task = loop.create_task(self._process_queue()) + self._processing_task = asyncio.create_task(self._process_queue()) async def stop(self): if self._processing_task: From 9530ded83a5fbad05f376e2bcb09cd6ae08cd79c Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Tue, 26 Nov 2024 17:05:54 +0530 Subject: [PATCH 25/28] fixed the final scraper_quickstart.py example --- docs/scraper/scraper_quickstart.py | 107 ++++++++++++++--------------- 1 file changed, 53 insertions(+), 54 deletions(-) diff --git a/docs/scraper/scraper_quickstart.py b/docs/scraper/scraper_quickstart.py index 811f997e..d92124f2 100644 --- a/docs/scraper/scraper_quickstart.py +++ b/docs/scraper/scraper_quickstart.py @@ -19,7 +19,7 @@ async def basic_scraper_example(): # Create a simple filter chain filter_chain = FilterChain([ # Only crawl pages within the blog section - # URLPatternFilter("*/tutorial/*"), + URLPatternFilter("*/tutorial/*"), # Only process HTML pages ContentTypeFilter(["text/html"]) ]) @@ -29,7 +29,8 @@ async def basic_scraper_example(): max_depth=2, # Only go 2 levels deep filter_chain=filter_chain, url_scorer=None, # Use default scoring - max_concurrent=3 # Limit concurrent requests + max_concurrent=3, # Limit concurrent requests + process_external_links=True ) # Create the crawler and scraper @@ -79,8 +80,8 @@ async def advanced_scraper_example(): filter_chain = FilterChain([ # Domain control DomainFilter( - allowed_domains=["example.com", "blog.example.com"], - blocked_domains=["ads.example.com", "tracker.example.com"] + allowed_domains=["techcrunch.com"], + 
blocked_domains=["login.techcrunch.com","legal.yahoo.com"] ), # URL patterns URLPatternFilter([ @@ -114,7 +115,7 @@ async def advanced_scraper_example(): # Initialize strategy with advanced configuration strategy = BFSScraperStrategy( - max_depth=4, + max_depth=2, filter_chain=filter_chain, url_scorer=scorer, max_concurrent=5, @@ -122,63 +123,61 @@ async def advanced_scraper_example(): ) # Create crawler and scraper - crawler = AsyncWebCrawler() - scraper = AsyncWebScraper(crawler, strategy) + async with AsyncWebCrawler(verbose=True) as crawler: + scraper = AsyncWebScraper(crawler, strategy) - # Track statistics - stats = { - 'processed': 0, - 'errors': 0, - 'total_size': 0 - } + # Track statistics + stats = { + 'processed': 0, + 'errors': 0, + 'total_size': 0 + } - try: - # Use streaming mode - async for result in scraper.ascrape("https://example.com/news/", stream=True): - stats['processed'] += 1 - - if result.success: - stats['total_size'] += len(result.html) - logger.info(f"Processed: {result.url}") + try: + # Use streaming mode + result_generator = await scraper.ascrape("https://techcrunch.com", parallel_processing=True, stream=True) + async for result in result_generator: + stats['processed'] += 1 - # Print scoring information - for scorer_name, score in result.scores.items(): - logger.debug(f"{scorer_name}: {score:.2f}") - else: - stats['errors'] += 1 - logger.error(f"Failed to process {result.url}: {result.error_message}") + if result.success: + stats['total_size'] += len(result.html) + logger.info(f"Processed: {result.url}") + else: + stats['errors'] += 1 + logger.error(f"Failed to process {result.url}: {result.error_message}") - # Log progress regularly - if stats['processed'] % 10 == 0: - logger.info(f"Progress: {stats['processed']} URLs processed") + # Log progress regularly + if stats['processed'] % 10 == 0: + logger.info(f"Progress: {stats['processed']} URLs processed") - except Exception as e: - logger.error(f"Scraping error: {e}") - - finally: - # 
Print final statistics - logger.info("Scraping completed:") - logger.info(f"- URLs processed: {stats['processed']}") - logger.info(f"- Errors: {stats['errors']}") - logger.info(f"- Total content size: {stats['total_size'] / 1024:.2f} KB") + except Exception as e: + logger.error(f"Scraping error: {e}") - # Print filter statistics - for filter_ in filter_chain.filters: - logger.info(f"{filter_.name} stats:") - logger.info(f"- Passed: {filter_.stats.passed_urls}") - logger.info(f"- Rejected: {filter_.stats.rejected_urls}") - - # Print scorer statistics - logger.info("Scoring statistics:") - logger.info(f"- Average score: {scorer.stats.average_score:.2f}") - logger.info(f"- Score range: {scorer.stats.min_score:.2f} - {scorer.stats.max_score:.2f}") + finally: + # Print final statistics + logger.info("Scraping completed:") + logger.info(f"- URLs processed: {stats['processed']}") + logger.info(f"- Errors: {stats['errors']}") + logger.info(f"- Total content size: {stats['total_size'] / 1024:.2f} KB") + + # Print filter statistics + for filter_ in filter_chain.filters: + logger.info(f"{filter_.name} stats:") + logger.info(f"- Passed: {filter_.stats.passed_urls}") + logger.info(f"- Rejected: {filter_.stats.rejected_urls}") + + # Print scorer statistics + logger.info("Scoring statistics:") + logger.info(f"- Average score: {scorer.stats.average_score:.2f}") + logger.info(f"- Score range: {scorer.stats.min_score:.2f} - {scorer.stats.max_score:.2f}") if __name__ == "__main__": import asyncio # Run basic example - print("Running basic scraper example...") - asyncio.run(basic_scraper_example()) - - # print("\nRunning advanced scraper example...") - # asyncio.run(advanced_scraper_example()) \ No newline at end of file + # print("Running basic scraper example...") + # asyncio.run(basic_scraper_example()) + + # Run advanced example + print("\nRunning advanced scraper example...") + asyncio.run(advanced_scraper_example()) \ No newline at end of file From 
ff731e4ea1f61864f3bbe72d6cd0695ed4ecd7b5 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Tue, 26 Nov 2024 17:08:32 +0530 Subject: [PATCH 26/28] fixed the final scraper_quickstart.py example --- docs/scraper/scraper_quickstart.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/scraper/scraper_quickstart.py b/docs/scraper/scraper_quickstart.py index d92124f2..9e380cf5 100644 --- a/docs/scraper/scraper_quickstart.py +++ b/docs/scraper/scraper_quickstart.py @@ -175,8 +175,8 @@ if __name__ == "__main__": import asyncio # Run basic example - # print("Running basic scraper example...") - # asyncio.run(basic_scraper_example()) + print("Running basic scraper example...") + asyncio.run(basic_scraper_example()) # Run advanced example print("\nRunning advanced scraper example...") From 2f5e0598bbbcf8dacaee65ebf70cb98c7c699fa5 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Tue, 26 Nov 2024 18:26:57 +0530 Subject: [PATCH 27/28] updated definition of can_process_url to include depth as an argument, as it's needed to skip filters for start_url --- crawl4ai/scraper/scraper_strategy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawl4ai/scraper/scraper_strategy.py b/crawl4ai/scraper/scraper_strategy.py index f1588f0c..396ea7c4 100644 --- a/crawl4ai/scraper/scraper_strategy.py +++ b/crawl4ai/scraper/scraper_strategy.py @@ -32,7 +32,7 @@ class ScraperStrategy(ABC): pass @abstractmethod - async def can_process_url(self, url: str) -> bool: + async def can_process_url(self, url: str, depth: int) -> bool: """Check if URL can be processed based on strategy rules""" pass From 7a5f83b76f29a21ba8a0d1b66b822e8d04905f39 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Wed, 18 Dec 2024 10:33:09 +0530 Subject: [PATCH 28/28] fix: Added browser config and crawler run config from 0.4.22 --- crawl4ai/scraper/bfs_scraper_strategy.py | 5 +++-- docs/scraper/scraper_quickstart.py | 10 ++++++---- 2 files changed, 9 insertions(+), 6 deletions(-) 
diff --git a/crawl4ai/scraper/bfs_scraper_strategy.py b/crawl4ai/scraper/bfs_scraper_strategy.py index 3a6d09a5..eb7f4cd8 100644 --- a/crawl4ai/scraper/bfs_scraper_strategy.py +++ b/crawl4ai/scraper/bfs_scraper_strategy.py @@ -14,7 +14,7 @@ from collections import defaultdict from .models import CrawlResult from .filters import FilterChain from .scorers import URLScorer -from ..async_webcrawler import AsyncWebCrawler +from ..async_webcrawler import AsyncWebCrawler, CrawlerRunConfig from .scraper_strategy import ScraperStrategy @dataclass @@ -116,7 +116,8 @@ class BFSScraperStrategy(ScraperStrategy): ) -> CrawlResult: """Crawl URL with retry logic""" try: - return await asyncio.wait_for(crawler.arun(url), timeout=self.timeout) + crawler_config = CrawlerRunConfig(cache_mode="BYPASS") + return await asyncio.wait_for(crawler.arun(url, config=crawler_config), timeout=self.timeout) except asyncio.TimeoutError: self.logger.error(f"Timeout crawling {url}") raise diff --git a/docs/scraper/scraper_quickstart.py b/docs/scraper/scraper_quickstart.py index 9e380cf5..f6100e51 100644 --- a/docs/scraper/scraper_quickstart.py +++ b/docs/scraper/scraper_quickstart.py @@ -6,9 +6,11 @@ from crawl4ai.scraper import ( URLPatternFilter, ContentTypeFilter ) -from crawl4ai.async_webcrawler import AsyncWebCrawler +from crawl4ai.async_webcrawler import AsyncWebCrawler, BrowserConfig import re +browser_config = BrowserConfig(headless=True, viewport_width=800, viewport_height=600) + async def basic_scraper_example(): """ Basic example: Scrape a blog site for articles @@ -34,7 +36,7 @@ async def basic_scraper_example(): ) # Create the crawler and scraper - async with AsyncWebCrawler(verbose=True) as crawler: + async with AsyncWebCrawler(config=browser_config,verbose=True) as crawler: scraper = AsyncWebScraper(crawler, strategy) # Start scraping try: @@ -118,12 +120,12 @@ async def advanced_scraper_example(): max_depth=2, filter_chain=filter_chain, url_scorer=scorer, - max_concurrent=5, + 
max_concurrent=2, min_crawl_delay=1 ) # Create crawler and scraper - async with AsyncWebCrawler(verbose=True) as crawler: + async with AsyncWebCrawler(verbose=True, config=browser_config) as crawler: scraper = AsyncWebScraper(crawler, strategy) # Track statistics